In [0]:
# Basic imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Make IPython display the value of every top-level expression in a cell, not
# only the last one. Later cells rely on this to show several results (for
# example two .head() previews) from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Plot styling: load matplotlib's 'ggplot' style sheet first, then apply
# seaborn's 'white' style. seaborn runs second, so its settings take
# precedence wherever the two overlap.
plt.style.use('ggplot')
sns.set_style('white')

In [0]:
# Code to read csv file into colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
# Authenticate and create the PyDrive client.
# Step 1: run the Colab authentication flow for the current user.
auth.authenticate_user()
# Step 2: create a PyDrive auth object and assign it the application-default
# Google credentials.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# Step 3: `drive` is the client the following cell uses to fetch the data files.
drive = GoogleDrive(gauth)

In [0]:
# Get the files.
# Each CreateFile call builds a handle for a hard-coded Google Drive file id;
# GetContentFile then saves that file's content under the given local name
# (the names carry no directory, so they land in the working directory).
downloaded_train = drive.CreateFile({'id':'1qQNzv_3hPCRECoaLypsOM_hGbF99_pyp'})
downloaded_train.GetContentFile('train_HousePrice_Kaggle.csv')

downloaded_test = drive.CreateFile({'id':'1be8lYYutY-7xLXz8ROj7YCJLFHpRv9ah'}) 
downloaded_test.GetContentFile('test_HousePrice_Kaggle.csv')

In [0]:
# Load the two downloaded CSV files into pandas DataFrames.
import pandas as pd

# Both files are read with the same options: the 'Id' column becomes the index.
csv_options = dict(index_col='Id')
df_train = pd.read_csv('train_HousePrice_Kaggle.csv', **csv_options)
df_test = pd.read_csv('test_HousePrice_Kaggle.csv', **csv_options)

In [0]:
# Inspect: preview the first two rows of each frame, then show the column
# names that appear in only one of the two frames.
# Each statement is left as a bare expression so that the notebook's
# ast_node_interactivity = "all" setting displays all three results.
df_train.head(n=2)

df_test.head(n=2)

set(df_train.columns) ^ set(df_test.columns)

In [0]:
# Initial EDA

# Plain Python list holding every column name of the training frame.
columns = df_train.columns.tolist()

# Disabled sketch: per-column [null count, dtype] lookup.
# column_dict = {
#     col: [df_train[col].isnull().sum(), df_train[col].dtype]
#     for col in columns
# }

In [0]:
# Corr heatmap for numerical vars

# Select the numeric/bool columns explicitly before correlating. Older pandas
# silently dropped non-numeric columns inside DataFrame.corr(); pandas >= 2.0
# raises on string columns instead, so relying on the implicit drop breaks.
# select_dtypes works the same way on both old and new pandas.
numeric_corr = df_train.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(20,15))
sns.heatmap(numeric_corr)

In [0]:
# LinReg using just highest corr numerical vars
# NOTE: every r2 reported in this cell is in-sample, i.e. scored on the same
# rows the model was fitted on.

from sklearn.linear_model import LinearRegression

# `normalize` is no longer passed: False was its default value and the
# argument was removed in scikit-learn 1.2, where passing it raises TypeError.
model = LinearRegression()
X = df_train[['OverallQual', 'GrLivArea', 'TotalBsmtSF', '1stFlrSF', 'GarageCars', 'GarageArea']]
y = df_train.SalePrice
model.fit(X, y)

from sklearn.metrics import r2_score
y_pred = model.predict(X)

print('\n')
r2_score(y, y_pred)
print('\n')


# LinReg using all numerical vars

# Take the numeric column names straight from the dtypes rather than from
# df_train.corr(): corr() raises on string columns in pandas >= 2.0, and the
# correlation values themselves are not needed here.
numerical_features = list(df_train.select_dtypes(include=['number', 'bool']).columns)
numerical_features.remove('SalePrice')

X = df_train[numerical_features]
y = df_train.SalePrice

# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer
# is its replacement and also defaults to mean imputation. The name `Imputer`
# stays bound so that later cells calling Imputer() keep working.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn < 0.20 has no sklearn.impute module
    from sklearn.preprocessing import Imputer
imp = Imputer()
X = imp.fit_transform(X)

model = LinearRegression()
model.fit(X, y)

y_pred = model.predict(X)

print('\n')
r2_score(y, y_pred)

In [0]:
# EDA on numerical vars

# Distribution of the target column: histogram with a KDE curve requested.
sale_price = df_train['SalePrice']
sns.distplot(sale_price, kde=True)

# Disabled sketch: one histogram per numerical feature, with missing values
# replaced by -1 before plotting.
# for feat in numerical_features:
#   plt.figure()
#   sns.distplot(df_train[feat].fillna(-1), kde=False)

In [0]:
# Categorical numerical vars

# Column names grouped under the heading above. Line breaks inside the
# brackets need no backslash continuation.
cat_num_vars = [
    'OverallQual', 'OverallCond',
    'BsmtFullBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
    'Fireplaces', 'GarageCars',
    'MoSold', 'YrSold',
]

# Disabled sketch: value counts for each of these columns.
# for var in cat_num_vars:
#   df_train[var].value_counts()

In [0]:
# Categorical vars

# Every training column that is not listed in numerical_features, with the
# target column removed afterwards.
categorical_features = df_train.drop(columns=numerical_features).columns.tolist()
categorical_features.remove('SalePrice')

# Disabled sketch: value counts for each categorical column.
# for feat in categorical_features:
#   df_train[feat].value_counts()
#   print('\n')

# Dummy-encode the categorical columns, dropping the first level of each.
categorical_frame = df_train[categorical_features]
df_train_cat = pd.get_dummies(categorical_frame, drop_first=True)
df_train_cat.info()


# Basic prediction using just categoricals & LinReg

X, y = df_train_cat, df_train.SalePrice

model = LinearRegression()
# Left as a bare expression so the fitted estimator is still displayed.
model.fit(X, y)

# Predictions on the same rows the model was fitted on.
y_pred = model.predict(X)

print('\n')
r2_score(y, y_pred)

In [0]:
# Basic prediction using numericals & categoricals using LinReg

# Un-imputed design matrix: dummy-encoded categoricals next to the raw numeric
# columns. It is kept under its own name so the train/test split below can
# start from data that has not been imputed yet.
X_raw = pd.concat([df_train_cat, df_train[numerical_features]], axis=1)
y = df_train.SalePrice

# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer
# is its replacement and also defaults to mean imputation. Importing here
# keeps this cell runnable on its own.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn < 0.20 has no sklearn.impute module
    from sklearn.preprocessing import Imputer

# Full-data fit: in-sample r2 only, so imputing on all rows is fine here.
imp = Imputer()
X = imp.fit_transform(X_raw)

model = LinearRegression()
model.fit(X, y)

y_pred = model.predict(X)

print('\n')
r2_score(y, y_pred)


# Train-test split w/in df_train
from sklearn.model_selection import train_test_split

# Split the raw matrix first and fit a separate imputer on the training rows
# only. Imputing before the split would let the held-out rows contribute to
# the fill-in statistics and leak into the test score.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_raw, y, test_size=0.3, random_state=42)

split_imp = Imputer()
X_train = split_imp.fit_transform(X_train_raw)
X_test = split_imp.transform(X_test_raw)

print('\n')

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier

model = LinearRegression()
# NOTE(review): the disabled alternative below is a classifier; for this
# regression target RandomForestRegressor would be the matching estimator.
# model = RandomForestClassifier()
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print('\n')
print('Train r2:', r2_score(y_train, y_pred_train))

print('Test r2:', r2_score(y_test, y_pred_test))

In [0]:
# Initial Dimensionality Reduction & basic r2

from sklearn.decomposition import PCA

# Seed PCA with the same value used for the train/test split. With the default
# svd_solver='auto', scikit-learn may choose the randomized SVD solver for a
# matrix of this size, and an unseeded run then varies between executions.
# NOTE(review): no scaling step is visible before PCA, so the components will
# be dominated by the largest-variance columns -- consider standardizing.
pca = PCA(n_components=10, random_state=42)

# Fit the projection on the training rows only, then reuse it for the test rows.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

model = LinearRegression()
model.fit(X_train_pca, y_train)

y_pred_train = model.predict(X_train_pca)
y_pred_test = model.predict(X_test_pca)

print('\n')
print('Train r2:', r2_score(y_train, y_pred_train))

print('Test r2:', r2_score(y_test, y_pred_test))