## Supervised - Regression

In [51]:
import warnings
warnings.filterwarnings("ignore")
from colorama import Fore, Back, Style 
import pandas as pd
import os
import numpy as np
#import scattertext as st
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import ARDRegression
from sklearn.ensemble import BaggingRegressor
from scipy.cluster.hierarchy import dendrogram, linkage
#from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [52]:
data = pd.read_csv("https://raw.githubusercontent.com/vigneshjmurali/Statistical-Predictive-Modelling/master/Datasets/data2.csv")
#data.head(3)

In [53]:
#data.columns
#data.dtypes

## Preprocessing

In [54]:
vec = TfidfVectorizer()
X = vec.fit_transform(data['field'].values.astype('str'))
field = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
vec = TfidfVectorizer()
X = vec.fit_transform(data['gender'].values.astype('str'))
gender = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
vec = TfidfVectorizer()
X = vec.fit_transform(data['race'].values.astype('str'))
race = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
vec = TfidfVectorizer()
X = vec.fit_transform(data['race_o'].values.astype('str'))
race_o= pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
data = pd.merge(data, field, left_index=True, right_index=True)
data = pd.merge(data, gender, left_index=True, right_index=True)
data = pd.merge(data, race, left_index=True, right_index=True)
data = pd.merge(data, race_o, left_index=True, right_index=True)
data = data.drop(['field','gender','race','race_o'], axis = 1) #drops the now redundant columns

In [55]:
data.head()

Unnamed: 0.1,Unnamed: 0,has_null,wave,age,age_o,d_age,d_d_age,samerace,importance_same_race,importance_same_religion,...,american,asian_y,black_y,caucasian_y,european_y,hispanic_y,islander_y,latino_y,other_y,pacific_y
0,1,0,1,21,27,6,5,0,2,4,...,0.434931,0.0,0.0,0.636724,0.636724,0.0,0.0,0.0,0.0,0.0
1,2,0,1,21,22,1,1,0,2,4,...,0.434931,0.0,0.0,0.636724,0.636724,0.0,0.0,0.0,0.0,0.0
2,3,1,1,21,22,1,1,1,2,4,...,0.175081,0.803885,0.0,0.0,0.0,0.0,0.401943,0.0,0.0,0.401943
3,4,0,1,21,23,2,3,0,2,4,...,0.434931,0.0,0.0,0.636724,0.636724,0.0,0.0,0.0,0.0,0.0
4,5,0,1,21,24,3,3,0,2,4,...,0.20828,0.0,0.0,0.0,0.0,0.691599,0.0,0.691599,0.0,0.0


In [56]:
print(data.shape)
#list(data.columns)

(8378, 155)


## partition the data

In [57]:
y_data = data['match']
x_data = data.drop('match', axis=1)
xtrain, xtest, ytrain, ytest = train_test_split(x_data, y_data,random_state=9)
Xcolnames = list(x_data)
print(y_data.shape); print(x_data.shape)

(8378,)
(8378, 154)


## Linear Regression

In [58]:
lr = LinearRegression().fit(xtrain, ytrain)
print("Training set score: {:.2f}".format(lr.score(xtrain, ytrain)))
print("Test set score: {:.2f}".format(lr.score(xtest, ytest)))
scores = cross_val_score(lr, x_data, y_data, cv=5)
print(Fore.BLUE + "Average cross-validation score: {:.2f}".format(scores.mean()))

Training set score: 0.60
Test set score: 0.56
[34mAverage cross-validation score: 0.55


## Ridge Regression

In [59]:
ridge = Ridge().fit(xtrain, ytrain)
print("Training set score: {:.2f}".format(ridge.score(xtrain, ytrain)))
print("Test set score: {:.2f}".format(ridge.score(xtest, ytest)))
scores = cross_val_score(ridge, x_data, y_data, cv=5)
print(Fore.BLUE +"Average cross-validation score: {:.2f}".format(scores.mean()))

Training set score: 0.60
Test set score: 0.56
[34mAverage cross-validation score: 0.55


## KNN

In [60]:
# instantiate the model and set the number of neighbors to consider to 3:
reg = KNeighborsRegressor(n_neighbors=3)
# fit the model using the training data and training targets:
reg.fit(xtrain, ytrain)
print("Test set R^2: {:.2f}".format(reg.score(xtest, ytest)))
scores = cross_val_score(reg, x_data, y_data, cv=5)
print(Fore.BLUE +"Average cross-validation score: {:.2f}".format(scores.mean()))

Test set R^2: -0.19
[34mAverage cross-validation score: -0.36


## Lasso

In [61]:
lasso = Lasso().fit(xtrain, ytrain)
print("Training set score: {:.2f}".format(lasso.score(xtrain, ytrain)))
print("Test set score: {:.2f}".format(lasso.score(xtest, ytest)))
print("Number of features used: {}".format(np.sum(lasso.coef_ != 0)))
scores = cross_val_score(lasso, x_data,y_data, cv=5)
print(Fore.BLUE +"Average cross-validation score: {:.2f}".format(scores.mean()))

Training set score: 0.00
Test set score: -0.00
Number of features used: 1
[34mAverage cross-validation score: -0.00


## Decision Tree Regression

In [62]:
tree = DecisionTreeRegressor().fit(xtrain, ytrain)
print("Training set score: {:.2f}".format(tree.score(xtrain, ytrain)))
print("Test set score: {:.2f}".format(tree.score(xtest, ytest)))
scores = cross_val_score(tree, x_data,y_data, cv=5)
print(Fore.BLUE +"Average cross-validation score: {:.2f}".format(scores.mean()))

Training set score: 1.00
Test set score: 1.00
[34mAverage cross-validation score: 1.00


## Random Forest Regressor

In [63]:
rfr = RandomForestRegressor().fit(xtrain, ytrain)
print("Training set score: {:.2f}".format(rfr.score(xtrain, ytrain)))
print("Test set score: {:.2f}".format(rfr.score(xtest, ytest)))
scores = cross_val_score(rfr, x_data,y_data, cv=5)
print(Fore.BLUE +"Average cross-validation score: {:.2f}".format(scores.mean()))

Training set score: 1.00
Test set score: 1.00
[34mAverage cross-validation score: 1.00


## Bagging Regressor

In [64]:
bingr = BaggingRegressor().fit(xtrain, ytrain)
print("Training set score: {:.2f}".format(bingr.score(xtrain, ytrain)))
print("Test set score: {:.2f}".format(bingr.score(xtest, ytest)))
scores = cross_val_score(bingr, x_data,y_data, cv=5)
print(Fore.BLUE +"Average cross-validation score: {:.2f}".format(scores.mean()))

Training set score: 1.00
Test set score: 1.00
[34mAverage cross-validation score: 1.00


## Gaussian Process Regressor

In [65]:
gpr = GaussianProcessRegressor().fit(xtrain, ytrain)
print("Training set score: {:.2f}".format(gpr.score(xtrain, ytrain)))
print("Test set score: {:.2f}".format(gpr.score(xtest, ytest)))

Training set score: 1.00
Test set score: -0.18
