# <font color='Black'>Project 1 Regression</font>

In [None]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#reading data
df=pd.read_csv('avocado2.csv')


In [None]:
# Work on a reproducible 10% random sample of the rows (fixed random_state).
df=df.sample(frac=.1,random_state=0)

# Data Exploration and Preprocessing

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
#finding correlations between numerical features
import seaborn as sns
# Positional slice 3:9 is assumed to select the numeric volume/bag columns -- TODO confirm against df.info().
df1=df.iloc[:,3:9]
f, ax = plt.subplots(figsize=(10, 8))
corr = df1.corr()
# `np.bool` was removed from NumPy (1.24+); the builtin `bool` is the supported dtype.
# The all-False mask hides no cells, so the full correlation matrix is drawn.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

In [None]:
#displaying columns having null values
df.columns[df.isnull().any()]

In [None]:
# Fill missing values with the mean of the row's 'Type' group, one column at a time.
for col in ('AveragePrice', 'XLarge Bags'):
    df[col] = df.groupby('Type')[col].transform(lambda grp: grp.fillna(grp.mean()))



In [None]:
# Encode the two-valued text columns as 0/1 integers.
yes_no = {'No' : 0, 'Yes' : 1}
for col in ('Rain', 'Snow'):
    df[col] = df[col].replace(yes_no)
df['Type'] = df['Type'].replace({'conventional' : 0, 'organic' : 1})


In [None]:
# One-hot encode each categorical column in turn; the dummy columns are
# prefixed with the name of the column they came from.
for col in ('Year', 'Region', 'Month', 'Season'):
    df = pd.get_dummies(df, columns = [col], prefix = [col])

In [None]:
# Remove identifier/aggregate columns that are not used as model features.
df.drop(
    columns=['Date', 'Sl.No', 'Total Volume', 'Total Bags', 'Month Number'],
    inplace=True,
)

In [None]:
# Target is AveragePrice; every remaining column is a feature.
target_col = 'AveragePrice'
y = df[target_col]
X = df.drop(columns=[target_col])
# Keep the feature names as a plain list for later labelling.
names = X.columns.tolist()

In [None]:
#Scaling & splitting into train and test dataset
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
# Split first (fixed seed), then fit the scaler on the training rows only and
# reuse that fitted scaler for the test rows, so test data never influences
# the scaling parameters.
X_train_org, X_test_org, y_train, y_test = train_test_split(X,y, random_state = 0)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_org)
X_test = scaler.transform(X_test_org)

StandardScaler removes the mean and scales the data to unit variance. However, outliers influence the
empirical mean and standard deviation, which shrinks the range of the feature values. StandardScaler therefore
cannot guarantee balanced feature scales in the presence of outliers.
MinMaxScaler rescales the data set such that all feature values are in the range [0, 1]. Like StandardScaler, MinMaxScaler is
very sensitive to the presence of outliers.
It doesn't really matter which scaling is used for this particular dataset.

# Grid Search

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split

# Naive grid search: every (gamma, C) pair is fitted on the training split and
# scored directly on the test split.
# NOTE: because the test split is used to pick the parameters, the "best score"
# printed below is an optimistic estimate of generalization performance.
X_train_org, X_test_org, y_train, y_test = train_test_split(X,y, random_state = 0)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_org)
X_test = scaler.transform(X_test_org)
print("Size of training set: {}   size of test set: {}".format(X_train.shape[0], X_test.shape[0]))

# A regressor's score can be negative, so start below every possible score;
# starting at 0 could leave best_parameters undefined if no combination beats 0.
best_score = float("-inf")
best_parameters = None

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters, train an SVR
        svm = SVR(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        # evaluate the SVR on the test set
        score = svm.score(X_test, y_test)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}

print("Best score: {:.2f}".format(best_score))
print("Best parameters: {}".format(best_parameters))


In [None]:
from sklearn.svm import SVR

y = df['AveragePrice']
X = df.drop(['AveragePrice'], axis = 1)

# split data into train+validation set and test set
# (previously the "test" set was carved out of the already-scaled training split,
# so the real held-out split was discarded and the scaler had been fitted on rows
# that were later used for testing)
X_trainval_org, X_test_org, y_trainval, y_test = train_test_split(X, y, random_state = 0)

# fit the scaler on train+validation only, then apply it to the held-out test split
scaler = MinMaxScaler()
X_trainval = scaler.fit_transform(X_trainval_org)
X_test = scaler.transform(X_test_org)

# split train+validation set into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, random_state=1)


print("Size of training set: {}   size of validation set: {}   size of test set:"
      " {}\n".format(X_train.shape[0], X_valid.shape[0], X_test.shape[0]))

# A regressor's score can be negative, so start below every possible score;
# starting at 0 could leave best_parameters undefined if no combination beats 0.
best_score = float("-inf")
best_parameters = None

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters, train an SVR
        svm = SVR(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        # evaluate the SVR on the validation set
        score = svm.score(X_valid, y_valid)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}

# rebuild a model on the combined training and validation set,
# and evaluate it on the test set
svm = SVR(**best_parameters)
svm.fit(X_trainval, y_trainval)
test_score = svm.score(X_test, y_test)
print("Best score on validation set: {:.2f}".format(best_score))
print("Best parameters: ", best_parameters)
print("Test set score with best parameters: {:.2f}".format(test_score))

In [None]:
#Grid search with cross validation
from sklearn.model_selection import cross_val_score
import numpy as np

# Start a fresh search: without this reset the cross-validation means would be
# compared against whatever best_score an earlier cell left behind, and
# best_parameters could silently keep that earlier cell's choice.
best_score = float("-inf")
best_parameters = None

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters,
        # train an SVR
        svm = SVR(gamma=gamma, C=C)
        # perform 5-fold cross-validation on the train+validation data
        scores = cross_val_score(svm, X_trainval, y_trainval, cv=5)
        # compute mean cross-validation score
        score = np.mean(scores)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}
            
# rebuild a model on the combined training and validation set
svm = SVR(**best_parameters)
svm.fit(X_trainval, y_trainval)

In [None]:
# Hyperparameter grid for the search below: the same six candidate values
# (0.001 ... 100) for both C and gamma, i.e. 36 combinations.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
print("Parameter grid:\n{}".format(param_grid))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
grid_search = GridSearchCV(SVR(), param_grid, cv=5, return_train_score=True)

In [None]:
# Rebuild the plain train/test split and min-max scale it, because earlier cells
# re-bound X_train / X_test to other subsets. The scaler is fitted on the
# training rows only.
y = df['AveragePrice']
X = df.drop(['AveragePrice'], axis = 1)
X_train_org, X_test_org, y_train, y_test = train_test_split(X,y, random_state = 0)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_org)
X_test = scaler.transform(X_test_org)

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

In [None]:
import pandas as pd
# convert to DataFrame
results = pd.DataFrame(grid_search.cv_results_)
# show the first 5 rows
display(results.head())

In [None]:
%matplotlib notebook
import mglearn
# Arrange the 36 mean cross-validation scores as a 6x6 grid, one axis per
# hyperparameter; the labels below treat rows as C and columns as gamma.
# NOTE(review): the hard-coded (6, 6) must match the lengths in param_grid.
scores = np.array(results.mean_test_score).reshape(6, 6)

# plot the mean cross-validation scores
mglearn.tools.heatmap(scores, xlabel='gamma', xticklabels=param_grid['gamma'], ylabel='C', yticklabels=param_grid['C'], cmap="viridis")

# knn regressor

In [None]:
#knn
from sklearn.neighbors import KNeighborsRegressor
%matplotlib inline
# Fit a KNN regressor for k = 1..19 and record its score on the training and
# test splits, to see how the number of neighbours affects over/under-fitting.
train_score_array = []
test_score_array = []

for k in range(1,20):
    knn_reg = KNeighborsRegressor(k)
    knn_reg.fit(X_train, y_train)
    train_score_array.append(knn_reg.score(X_train, y_train))
    test_score_array.append(knn_reg.score(X_test, y_test))

x_axis = range(1,20)
plt.plot(x_axis, train_score_array, c = 'g', label = 'Train Score')
plt.plot(x_axis, test_score_array, c = 'b', label = 'Test Score')
plt.legend()
plt.xlabel('k')
# The plotted values come from .score(), which for regressors is R^2 (higher is
# better), not a mean squared error.
plt.ylabel('R^2 score')

In [None]:
knn=KNeighborsRegressor()   
knn.fit(X_train,y_train)
    

In [None]:
#using cross-validation to find average training score
from sklearn.model_selection import cross_val_score
scores = cross_val_score(knn, X_train,y_train, cv=5)
print("Cross-validation scores: {}".format(scores))

In [None]:
print("Average cross-validation score: {:.2f}".format(scores.mean()))

# Linear Regression

In [None]:
##Linear Regression
from sklearn.linear_model import LinearRegression
lreg = LinearRegression()
lreg.fit(X_train, y_train)


In [None]:
#using cross-validation to find average training score
from sklearn.model_selection import cross_val_score
scores = cross_val_score(lreg, X_train,y_train, cv=5)
print("Cross-validation scores: {}".format(scores))

In [None]:
print("Average cross-validation score: {:.2f}".format(scores.mean()))

In [None]:
#plotting fitted line
%matplotlib inline
import matplotlib.pyplot as plt

# Visualise a one-feature linear fit. A separate estimator is used so that the
# full-feature `lreg` model fitted earlier is not overwritten by this plot.
feature_idx = 5
X_train_rm = X_train[:,feature_idx].reshape(-1,1)
lreg_single = LinearRegression()
lreg_single.fit(X_train_rm, y_train)
y_predict = lreg_single.predict(X_train_rm)

plt.plot(X_train_rm, y_predict, c = 'r')
plt.scatter(X_train_rm,y_train)
# Label the axis with the actual column name instead of a hard-coded 'RM';
# X_train keeps the column order of X_train_org, so the index lines up.
plt.xlabel(X_train_org.columns[feature_idx])
plt.ylabel('AveragePrice')

# Ridge regression

In [None]:
from  sklearn.linear_model import Ridge

# Score a Ridge model on the train and test splits for each regularization strength.
x_range = [0.01, 0.1, 1, 10, 100]
train_score_list, test_score_list = [], []

for alpha in x_range:
    ridge = Ridge(alpha).fit(X_train, y_train)
    train_score_list += [ridge.score(X_train, y_train)]
    test_score_list += [ridge.score(X_test, y_test)]

In [None]:
#plotting train and test scores for different values of alpha
%matplotlib inline
import matplotlib.pyplot as plt
# Train vs. test score per alpha; alpha spans several orders of magnitude,
# hence the logarithmic x axis.
plt.plot(x_range, train_score_list, c = 'g', label = 'Train Score')
plt.plot(x_range, test_score_list, c = 'b', label = 'Test Score')
plt.xscale('log')
plt.legend(loc = 3)
plt.xlabel(r'$\alpha$')

In [None]:
ridge = Ridge(alpha = 1)
ridge.fit(X_train,y_train)

In [None]:
#using cross-validation to find average training score
from sklearn.model_selection import cross_val_score
scores = cross_val_score(ridge, X_train,y_train, cv=5)
print("Cross-validation scores: {}".format(scores))

In [None]:
print("Average cross-validation score: {:.2f}".format(scores.mean()))

In [None]:
%matplotlib inline
import numpy as np

# Dense alpha grid: 100 values in [0.001, 1] followed by 10000 values in [1, 10000].
x_range1 = np.linspace(0.001, 1, 100).reshape(-1,1)
x_range2 = np.linspace(1, 10000, 10000).reshape(-1,1)

# np.append without an axis flattens both column vectors into one 1-D array of alphas.
x_range = np.append(x_range1, x_range2)
coeff = []

# One Ridge fit per alpha; keep the coefficient vector of every fit.
for alpha in x_range: 
    ridge = Ridge(alpha)
    ridge.fit(X_train,y_train)
    coeff.append(ridge.coef_ )
    
coeff = np.array(coeff)

# Plot at most the first 13 coefficient paths; bounding by the real number of
# columns avoids an IndexError when there are fewer than 13 features.
for i in range(min(13, coeff.shape[1])):
    plt.plot(x_range, coeff[:,i], label = 'feature {:d}'.format(i))

# axhline's xmin/xmax are axis fractions in [0, 1], not data coordinates, so the
# defaults are used to draw the zero reference line across the full width.
plt.axhline(y=0, linewidth=1, c ='gray')
plt.xlabel(r'$\alpha$')
plt.xscale('log')
plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.5),
          ncol=3, fancybox=True, shadow=True)
plt.show()

# Lasso Regression

In [None]:
#running lasso with different values of alpha
#It doesn't make sense to do lasso regression for this dataset as the dataset has only around 15 features.
from sklearn.linear_model import Lasso
# Score a Lasso model on the train and test splits for each regularization strength.
x_range = [0.01, 0.1, 1, 10, 100]
train_score_list, test_score_list = [], []

for alpha in x_range:
    lasso = Lasso(alpha).fit(X_train, y_train)
    train_score_list += [lasso.score(X_train, y_train)]
    test_score_list += [lasso.score(X_test, y_test)]

In [None]:
#plotting train score and test score for different values of alpha
plt.plot(x_range, train_score_list, c = 'g', label = 'Train Score')
plt.plot(x_range, test_score_list, c = 'b', label = 'Test Score')
plt.xscale('log')
plt.legend(loc = 3)
plt.xlabel(r'$\alpha$')

In [None]:
# Refit Lasso with alpha = 0.001, a weaker penalty than any value tried in the
# sweep above (0.01 ... 100), and report its score on both splits.
lasso = Lasso(alpha = .001)
lasso.fit(X_train,y_train)
print('Train score: {:.4f}'.format(lasso.score(X_train,y_train)))
print('Test score: {:.4f}'.format(lasso.score(X_test, y_test)))

In [None]:
#using cross-validation to find average training score
from sklearn.model_selection import cross_val_score
scores = cross_val_score(lasso, X_train,y_train, cv=5)
print("Cross-validation scores: {}".format(scores))

In [None]:
print("Average cross-validation score on training set: {:.2f}".format(scores.mean()))

In [None]:
%matplotlib inline

# Dense alpha grid: 1000 values in [0.001, 1] followed by 1000 values in [1, 1000].
x_range1 = np.linspace(0.001, 1, 1000).reshape(-1,1)
x_range2 = np.linspace(1, 1000, 1000).reshape(-1,1)

# np.append without an axis flattens both column vectors into one 1-D array of alphas.
x_range = np.append(x_range1, x_range2)
coeff = []

# One Lasso fit per alpha; keep the coefficient vector of every fit.
for alpha in x_range: 
    lasso = Lasso(alpha)
    lasso.fit(X_train,y_train)
    coeff.append(lasso.coef_ )
    
coeff = np.array(coeff)

# Plot at most the first 13 coefficient paths; bounding by the real number of
# columns avoids an IndexError when there are fewer than 13 features.
for i in range(min(13, coeff.shape[1])):
    plt.plot(x_range, coeff[:,i], label = 'feature {:d}'.format(i))

# axhline's xmin/xmax are axis fractions in [0, 1], not data coordinates, so the
# defaults are used to draw the zero reference line across the full width.
plt.axhline(y=0, linewidth=1, c ='gray')
plt.xlabel(r'$\alpha$')
plt.xscale('log')
plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.5),
          ncol=3, fancybox=True, shadow=True)
plt.show()

# Polynomial Regression

In [None]:
from  sklearn.preprocessing  import PolynomialFeatures
# Fit linear regression on polynomial expansions of degree 1 and 2 and collect
# the train/test score of each.
train_score_list, test_score_list = [], []

for n in (1, 2):
    poly = PolynomialFeatures(n)
    # Expansion is learned on the training split and re-applied to the test split.
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    lreg.fit(X_train_poly, y_train)
    train_score_list += [lreg.score(X_train_poly, y_train)]
    test_score_list += [lreg.score(X_test_poly, y_test)]


In [None]:
from  sklearn.preprocessing  import PolynomialFeatures

# Degree-1 and degree-2 polynomial regression: expand the features, fit `lreg`
# on the expanded training data and record train/test scores per degree.
train_score_list = []
test_score_list = []

for n in range(1,3):
    poly = PolynomialFeatures(n)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    # `p` keeps whatever lreg.fit returns; after the loop it refers to the last
    # (degree-2) fit. NOTE(review): presumably this is the `lreg` object itself -- confirm.
    p=lreg.fit(X_train_poly, y_train)
    train_score_list.append(lreg.score(X_train_poly, y_train))
    test_score_list.append(lreg.score(X_test_poly, y_test))

In [None]:
print(train_score_list)
print(test_score_list)

In [None]:
#using cross-validation to estimate the score of degree-2 polynomial regression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
# Cross-validating the bare linear model on X_train would score ordinary linear
# regression, because X_train holds no polynomial terms. Wrapping the expansion
# and the regressor in one pipeline makes every fold build its own polynomial
# features from its own training part.
poly_reg = make_pipeline(PolynomialFeatures(2), LinearRegression())
scores = cross_val_score(poly_reg, X_train,y_train, cv=5)
print("Cross-validation scores: {}".format(scores))

In [None]:
print("Average cross-validation score: {:.2f}".format(scores.mean()))

# Support Vector Regressor Machine

In [None]:
from sklearn.svm import SVR
# Support vector regressor with a linear kernel and C = 1, fitted on the scaled training split.
regressor = SVR(kernel='linear',C=1)
regressor.fit(X_train,y_train)


In [None]:
#using cross-validation to find average training score
from sklearn.model_selection import cross_val_score
scores = cross_val_score(regressor, X_train,y_train, cv=5)
print("Cross-validation scores: {}".format(scores))

In [None]:
print("Average cross-validation score on training set: {:.2f}".format(scores.mean()))

In [None]:
from sklearn.svm import SVR
# Support vector regressor with an RBF kernel (C = 1, gamma = 0.1), fitted on the scaled training split.
rreg = SVR(kernel='rbf',C=1,gamma=.1)
rreg.fit(X_train,y_train)

In [None]:
#using cross-validation to find average training score
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rreg, X_train,y_train, cv=5)
print("Cross-validation scores: {}".format(scores))

In [None]:
print("Average cross-validation score on training set: {:.2f}".format(scores.mean()))

In [None]:
from sklearn.svm import SVR
# Support vector regressor with a polynomial kernel (C = 1, gamma = 0.1), fitted on the scaled training split.
preg = SVR(kernel='poly',C=1,gamma=.1)
preg.fit(X_train,y_train)

In [None]:
#using cross-validation to find average training score
from sklearn.model_selection import cross_val_score
scores = cross_val_score(preg, X_train,y_train, cv=5)
print("Cross-validation scores: {}".format(scores))

In [None]:
print("Average cross-validation score on training set: {:.2f}".format(scores.mean()))

# Choosing the best model 

In [None]:
#Based on the scores above, we choose the model that gives the best results
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
# Rebuild the train/test split and refit the scaler on the training rows only,
# so the final evaluation does not depend on state left by earlier cells.
X_train_org, X_test_org, y_train, y_test = train_test_split(X,y, random_state = 0)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_org)
X_test = scaler.transform(X_test_org)

In [None]:
# Final model: Ridge with alpha = 1. Print the training score, then the
# held-out test score.
ridge = Ridge(alpha = 1)
ridge.fit(X_train,y_train)
print(ridge.score(X_train, y_train))
print(ridge.score(X_test, y_test))

# <font color='Black'>Project 1 Classification</font>

## Data Preprocessing 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)

In [None]:
# Load the inspections table and keep a reproducible ~1% random sample of its rows.
df = pd.read_csv ('food-inspections.csv').sample(frac = 0.01016, random_state=0)

In [None]:
df.columns

In [None]:
df.info()

In [None]:
df.drop('DBA Name', axis = 1, inplace =True)
df.info()

In [None]:
df.info()
df.columns

In [None]:
# Drop free-text, location and date columns that are not used as features.
unused_columns = [
    'AKA Name', 'Address', 'City', 'State',
    'Inspection Date', 'Inspection Type',
    'Latitude', 'Longitude', 'Location',
    'Historical Wards 2003-2015', 'Zip Codes',
    'Community Areas', 'Census Tracts', 'Wards',
]
df.drop(columns=unused_columns, inplace=True)
df.columns

In [None]:
df['Facility Type'].unique()

In [None]:
df.drop('Facility Type', axis = 1, inplace =True)
df

In [None]:
df['Risk'].unique()

In [None]:
df = df.dropna(axis=0, subset=['Risk'])


In [None]:
# Discard rows that lack a licence number or a zip code.
df = df.dropna(axis=0, subset=['License #', 'Zip'])

In [None]:
df['Risk'].unique()

In [None]:
# One-hot encode Risk into Risk_* indicator columns, attach them to df,
# then drop the original text column.
df_cols = pd.get_dummies(df['Risk'], prefix='Risk')
df[df_cols.columns] = df_cols
df.drop('Risk', axis = 1, inplace =True)
df.head()

In [None]:
df.info()

In [None]:
df['Violations'].unique()

In [None]:
# Treat missing entries and '-' placeholders as 0, i.e. "no violation text recorded".
df['Violations'] = df['Violations'].replace(np.nan,0)
df['Violations'] = df['Violations'].replace('-',0)

In [None]:
# Boolean mask: True for rows whose Violations value is not the 0 placeholder.
m = df.Violations != 0
# Display only: the result is not assigned, so df is left unchanged. Rows where
# m is True are shown as '1'; the others keep their value.
df.Violations.where(~m,other='1')

In [None]:
df['Violation_1_0'] = m
df.head()

In [None]:
# Convert the boolean flag to 0/1 integers.
df['Violation_1_0'] = df['Violation_1_0'].astype(int)

In [None]:
# NOTE(review): Violation_1_0 is cast to int in the preceding step, so these
# replacements of the *strings* 'True'/'False' appear to match nothing and
# leave the column as it is -- confirm, then consider removing.
df['Violation_1_0'] = df['Violation_1_0'].replace('True',1)
df['Violation_1_0'] = df['Violation_1_0'].replace('False',0)
df.head()

In [None]:
df.drop('Violations', axis = 1, inplace =True)
df

In [None]:
df['Results'].unique()

In [None]:
# Cast Results to str before the label replacements below.
# NOTE(review): this presumably turns any missing value into the string 'nan',
# which a later dropna on Results would not remove -- confirm Results has no
# missing entries.
df['Results'] = (df['Results']).astype(str)

In [None]:
df['Results'] = df['Results'].replace('Not Ready',np.nan)
df.head()

In [None]:
df['Results'] = df['Results'].replace('Out of Business',np.nan)
df.head()

In [None]:
df['Results'] = df['Results'].replace('No Entry',np.nan)
df.head()

In [None]:
df['Results'] = df['Results'].replace('Business Not Located',np.nan)
df.head()

In [None]:
df = df.dropna(axis=0, subset=['Results'])
df.head()

In [None]:
df['Results'].unique()

In [None]:
# Binary target: both kinds of pass become 1, a failure becomes 0.
result_codes = {'Pass w/ Conditions': 1, 'Fail': 0, 'Pass': 1}
df['Results'] = df['Results'].replace(result_codes)
df.head()

In [None]:
# NumPy copy of the whole frame.
# NOTE(review): `dfnp` is not referenced again in the visible part of the notebook.
dfnp = np.asarray(df)

In [None]:
# Feature frame (everything except Results) and target column, plus NumPy
# versions of both.
# NOTE(review): none of these four names is referenced again in the visible
# part of the notebook; later cells rebuild X and y from df directly.
dfnpX = df.drop('Results',axis=1) 
dfnpy = df["Results"]
dfnpXx = np.asarray(dfnpX)
dfnpyy = np.asarray(dfnpy)

# Splitting the data

MinMaxScaler(feature_range = (0, 1)) will transform each value in the column proportionally within the range [0, 1]. Use this as the first scaler choice to transform a feature, as it will preserve the shape of the dataset (no distortion).

StandardScaler() will transform each value in the column to have mean 0 and standard deviation 1, i.e., each value will be normalised by subtracting the mean and dividing by the standard deviation. Use StandardScaler if you know the data distribution is normal.

If there are outliers, use RobustScaler(). Alternatively, you could remove the outliers and use either of the above two scalers (the choice depends on whether the data is normally distributed).

Since all the datapoints in this dataset are binary, any scaler would be a good fit for our machine learning model.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from mlxtend.plotting import plot_decision_regions
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline

from mlxtend.plotting import plot_decision_regions
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
import matplotlib.gridspec as gridspec
import itertools

import numpy as np
# Features are every column except the target.
X = df.drop(columns='Results')
y = df["Results"]
X_train_org, X_test_org, y_train, y_test = train_test_split(X, y, random_state=0)

# fit_transform learns each column's min/max from the training split and
# rescales it to [0, 1]; the test split is rescaled with those same values.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_org)
X_test = scaler.transform(X_test_org)

# Evaluation 

We use Grid search and Cross validation to evaluate the best parameters for this dataset 

# Grid search(Both Naive and grid search with Cross validation)

In [None]:
from sklearn.svm import SVC

X = df.drop('Results',axis=1)  
y = df["Results"]
# split data into train+validation set and test set
X_trainval_org, X_test_org, y_trainval, y_test = train_test_split(X, y, random_state=0)

# The kernel SVC is distance based, so the raw columns (ids, zip codes, 0/1
# flags) must share a scale for the gamma grid to be meaningful. The scaler is
# fitted on train+validation only and then applied to the held-out test split.
scaler = MinMaxScaler()
X_trainval = scaler.fit_transform(X_trainval_org)
X_test = scaler.transform(X_test_org)

# split train+validation set into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, random_state=1)

print("Size of training set: {}   size of validation set: {}   size of test set:"
      " {}\n".format(X_train.shape[0], X_valid.shape[0], X_test.shape[0]))

# Start below every possible score so best_parameters is always assigned.
best_score = float("-inf")
best_parameters = None

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters, train an SVC
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        # evaluate the SVC on the validation set
        score = svm.score(X_valid, y_valid)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}

# rebuild a model on the combined training and validation set,
# and evaluate it on the test set
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)
test_score = svm.score(X_test, y_test)
print("Best score on validation set: {:.2f}".format(best_score))
print("Best parameters: ", best_parameters)
print("Test set score with best parameters: {:.2f}".format(test_score))

In [None]:
import numpy as np

# Start a fresh search: without this reset the cross-validation means would be
# compared against whatever best_score an earlier cell left behind, and
# best_parameters could silently keep that earlier cell's choice.
best_score = float("-inf")
best_parameters = None

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters,
        # train an SVC
        svm = SVC(gamma=gamma, C=C)
        # perform 5-fold cross-validation on the train+validation data
        scores = cross_val_score(svm, X_trainval, y_trainval, cv=5)
        # compute mean cross-validation accuracy
        score = np.mean(scores)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}
            
# rebuild a model on the combined training and validation set
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
X = df.drop('Results',axis=1)  
y = df["Results"]
X_train_org, X_test_org, y_train, y_test = train_test_split(X, y, random_state = 0)

# The kernel SVC is distance based, so rescale every feature to [0, 1]; the
# scaler is fitted on the training split only.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_org)
X_test = scaler.transform(X_test_org)

print("Size of training set: {}   size of test set: {}".format(X_train.shape[0], X_test.shape[0]))

# NOTE: this naive search picks the parameters on the test split, so the
# "best score" printed below is an optimistic estimate.
best_score = float("-inf")
best_parameters = None

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters, train an SVC
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        # evaluate the SVC on the test set
        score = svm.score(X_test, y_test)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}

print("Best score: {:.2f}".format(best_score))
print("Best parameters: {}".format(best_parameters))

In [None]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
print("Parameter grid:\n{}".format(param_grid))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
grid_search = GridSearchCV(SVC(), param_grid, cv=5, return_train_score=True)

In [None]:
# Same split as before, but with features min-max scaled (scaler fitted on the
# training rows only) so the SVC grid search below works on comparable feature
# ranges.
X_train_org, X_test_org, y_train, y_test = train_test_split(X, y, random_state=0)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_org)
X_test = scaler.transform(X_test_org)

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

In [None]:
import pandas as pd
# convert to DataFrame
results = pd.DataFrame(grid_search.cv_results_)
# show the first 5 rows
display(results.head())

In [None]:
%matplotlib notebook
import mglearn
# Arrange the 36 mean cross-validation scores as a 6x6 grid, one axis per hyperparameter.
scores = np.array(results.mean_test_score).reshape(6, 6)

# plot the mean cross-validation scores
mglearn.tools.heatmap(scores, xlabel='gamma', xticklabels=param_grid['gamma'], ylabel='C', yticklabels=param_grid['C'], cmap="viridis")

### Grid search for decision tree models

In [None]:
# Import
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
X = df.drop('Results',axis=1)  
y = df["Results"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

In [None]:
# 1. Instantiate a decision tree with default settings and a fixed seed
dtc = DecisionTreeClassifier(random_state=0)
# 2. Fit
dtc.fit(X_train, y_train)

# 3. Predict the class of every row in the test split
y_pred_class = dtc.predict(X_test)

In [None]:
from sklearn import metrics
# Accuracy
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
# 1. Instantiate with min_samples_split = 4
dtc = DecisionTreeClassifier(min_samples_split=4, random_state=0)

# 2. Fit
dtc.fit(X_train, y_train)

# 3. Predict the class of every row in the test split
y_pred_class = dtc.predict(X_test)

# Accuracy on the test split
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
    # Import
    from sklearn.model_selection import GridSearchCV
    import matplotlib.gridspec as gridspec

    # Define the parameter values that should be searched:
    # candidate min_samples_split values 2, 3, ..., 49
    sample_split_range = list(range(2, 50))

    # Create a parameter grid: map the parameter names to the values that should be searched
    # Simply a python dictionary
    # Key: parameter name
    # Value: list of values that should be searched for that parameter
    # Single key-value pair for param_grid
    # NOTE: this rebinds the notebook-level name `param_grid` to the tree grid.
    param_grid = dict(min_samples_split=sample_split_range)
    

    # instantiate the grid: 10-fold cross-validation, scored by accuracy,
    # using the `dtc` estimator defined in an earlier cell
    grid = GridSearchCV(dtc, param_grid, cv=10, scoring='accuracy')

    # fit the grid with data
    grid.fit(X_train, y_train)

In [None]:
# examine the best model

# Single best score achieved across all params (min_samples_split)
print( grid.best_score_)

# Dictionary containing the parameters (min_samples_split) used to generate that score
print(grid.best_params_)

# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print(grid.best_estimator_)

### Cross Validation for logistic regression

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from mlxtend.plotting import plot_decision_regions
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.gridspec as gridspec
import itertools
import numpy as np

# Features are every column except the target.
X = df.drop(columns='Results')
y = df["Results"]
X_train_org, X_test_org, y_train, y_test = train_test_split(X, y, random_state=0)

# Standardize using statistics learned from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_org)
X_test = scaler.transform(X_test_org)

# 10-fold cross-validation of a default logistic regression on the training split.
logreg = LogisticRegression()
scores = cross_val_score(logreg, X_train, y_train, cv=10)
print("Cross-validation scores: {}".format(scores))

In [None]:
print("Average cross-validation score: {:.2f}".format(scores.mean()))

### Cross Validation for K Neighbors Classification

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

knn = KNeighborsClassifier()


scores = cross_val_score(knn, X_train, y_train,cv=10)
print("Cross-validation scores: {}".format(scores))

In [None]:
print("Average cross-validation score: {:.2f}".format(scores.mean()))

### Cross Validation for Linear SVC

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression


# Only `clf` (LinearSVC) is cross-validated in this cell; `logreg` and `knn`
# are re-created here but not used by the lines below.
logreg = LogisticRegression()
knn = KNeighborsClassifier()
clf = LinearSVC()

# 10-fold cross-validation on the training split
scores = cross_val_score(clf, X_train, y_train,cv=10)
print("Cross-validation scores: {}".format(scores))

In [None]:
print("Average cross-validation score: {:.2f}".format(scores.mean()))

### Cross Validation for Linear kernel SVC

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

clf = SVC(kernel='linear',C = 0.001)

scores = cross_val_score(clf, X_train, y_train,cv=15)
print("Cross-validation scores: {}".format(scores))

In [None]:
print("Average cross-validation score: {:.4f}".format(scores.mean()))

### Cross Validation for rbf kernel SVC

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

clf = SVC(kernel ='rbf',C = 0.001,gamma = 0.001)

scores = cross_val_score(clf, X_train, y_train,cv=15)
print("Cross-validation scores: {}".format(scores))

In [None]:
print("Average cross-validation score: {:.4f}".format(scores.mean()))

### Cross Validation for poly kernel SVC

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

clf = SVC(kernel ='poly', C = 0.001, gamma = 0.001, degree = 3, coef0=0.0)
                
scores = cross_val_score(clf, X_train, y_train,cv=10)
print("Cross-validation scores: {}".format(scores))

In [None]:
print("Average cross-validation score: {:.4f}".format(scores.mean()))

### Cross Validation for Decision tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Decision tree with min_samples_split = 49 and a fixed seed.
dtc = DecisionTreeClassifier(min_samples_split=49, random_state=0)
# NOTE(review): cross_val_score below is handed the estimator and the data
# itself, so this explicit fit is presumably not needed for it -- confirm.
dtc.fit(X_train, y_train)

# 10-fold cross-validation on the training split
scores = cross_val_score(dtc, X_train, y_train,cv=10)
print("Cross-validation scores: {}".format(scores))

In [None]:
print("Average cross-validation score: {:.4f}".format(scores.mean()))

## K nearest neighbor Classifier 

In [None]:
%matplotlib inline

from mlxtend.plotting import plot_decision_regions
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

X = df.drop('Results',axis=1)  
y = df["Results"]

In [None]:
X_train_org, X_test_org, y_train, y_test = train_test_split(X, y, random_state=0)

# fit_transform learns each column's min/max from the training split and
# rescales it to [0, 1]; the test split is rescaled with those same values.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_org)
X_test = scaler.transform(X_test_org)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Accuracy on the train and test splits for every neighbourhood size k = 1..19.
train_score_array = []
test_score_array = []

for k in range(1, 20):
    # fit() returns the estimator itself, so construction and fitting can be chained.
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    train_score_array.append(knn.score(X_train, y_train))
    test_score_array.append(knn.score(X_test, y_test))

In [None]:
x_axis = range(1,20)
%matplotlib inline
plt.plot(x_axis, train_score_array, label = 'Train Score', c = 'g')
plt.plot(x_axis, test_score_array, label = 'Test Score', c='b')
plt.xlabel('k')
plt.ylabel('Accuracy')
plt.legend()

It seems k = 6 is the best parameter for the KNN model.

In [None]:
# Refit KNN with the chosen number of neighbours and report accuracy on both splits.
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train, y_train)

knn_train_score = knn.score(X_train, y_train)
knn_test_score = knn.score(X_test, y_test)
print('Train score: {:.5f}'.format(knn_train_score))
print('Test score: {:.5f}'.format(knn_test_score))

## Logistic Regression

In [None]:
# Show the whole DataFrame as a NumPy array. The result is not assigned, so
# this cell only produces output; nothing later in the notebook receives it.
df.to_numpy()

In [None]:
from sklearn.linear_model import LogisticRegression

# Sweep the inverse regularisation strength C for L1- and L2-penalised logistic
# regression, recording accuracy on the train and test splits for each value.
c_range = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
train_score_l1 = []
train_score_l2 = []
test_score_l1 = []
test_score_l2 = []

for c in c_range:
    # The L1 penalty is only supported by some solvers. Recent scikit-learn
    # releases default to 'lbfgs', which rejects penalty='l1' with a ValueError,
    # so the solver is chosen explicitly ('liblinear' was the historical default).
    log_l1 = LogisticRegression(penalty='l1', C=c, solver='liblinear')
    # The L2 model keeps the library default solver, matching the final
    # LogisticRegression(C=0.01) model fitted later in this section.
    log_l2 = LogisticRegression(penalty='l2', C=c)
    log_l1.fit(X_train, y_train)
    log_l2.fit(X_train, y_train)
    train_score_l1.append(log_l1.score(X_train, y_train))
    train_score_l2.append(log_l2.score(X_train, y_train))
    test_score_l1.append(log_l1.score(X_test, y_test))
    test_score_l2.append(log_l2.score(X_test, y_test))

In [None]:
%matplotlib inline

# Plot train/test accuracy against C for both penalties on a log-scaled x-axis.
curves = [
    (train_score_l1, 'Train score, penalty= l1'),
    (test_score_l1, 'Test score, penalty = l1'),
    (train_score_l2, 'Train score, penalty = l2'),
    (test_score_l2, 'Test score, penalty = l2'),
]
for accuracies, curve_label in curves:
    plt.plot(c_range, accuracies, label=curve_label)
plt.legend()
plt.xlabel('Regularization parameter: C')
plt.ylabel('Accuracy')
plt.xscale('log')

It seems $C = 10^{-2}$ is the best parameter for the logistic regression model.

In [None]:
%matplotlib inline

from mlxtend.plotting import plot_decision_regions

# Rebuild the feature matrix / target, split, scale and fit the final
# logistic-regression model with the chosen C.
X = df.drop('Results', axis=1)
y = df["Results"]
# NOTE(review): this split uses random_state=1, whereas the KNN, kernel-SVC and
# decision-tree sections split with random_state=0, and the C sweep above ran on
# whatever split was in memory before this cell. Confirm the different seed is
# intentional before comparing scores across models.
X_train_org, X_test_org, y_train, y_test = train_test_split(X, y, random_state=1)

scaler = MinMaxScaler()
# fit_transform learns each column's min/max from the training data and rescales
# it to [0, 1] in one step, so a separate scaler.fit() call is unnecessary.
X_train = scaler.fit_transform(X_train_org)
# The test set is rescaled with the training-set min/max only (no refit).
X_test = scaler.transform(X_test_org)

lreg = LogisticRegression(C=0.01)
lreg.fit(X_train, y_train)

print('Train score: {:.4f}'.format(lreg.score(X_train, y_train)))
print('Test score: {:.4f}'.format(lreg.score(X_test, y_test)))


## Linear Support Vector Machine

In [None]:
%matplotlib inline

from mlxtend.plotting import plot_decision_regions
from sklearn.svm import LinearSVC
import numpy as np
# Seed value for this section, plus the target column and the feature matrix
# (every column except the target).
random_state = 0
y = df["Results"]
X = df.drop(columns='Results')


In [None]:
# Print a concise summary (columns, non-null counts, dtypes) of the feature matrix X.
X.info()

In [None]:

# Linear SVM with default hyperparameters.
# NOTE(review): this section redefines X and y but does not split or scale them,
# so X_train/X_test/y_train/y_test are the ones left in memory by the last cell
# that created them (the logistic-regression cell when run top to bottom).
# LinearSVC's solver uses a random number generator, so the seed defined for
# this section is passed in to make the scores reproducible.
clf = LinearSVC(random_state=random_state)
clf.fit(X_train, y_train)

print('Train score: {:.4f}'.format(clf.score(X_train, y_train)))
print('Test score: {:.4f}'.format(clf.score(X_test, y_test)))


## SVC with kernel trick

In [None]:
%matplotlib inline

from mlxtend.plotting import plot_decision_regions
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
import matplotlib.gridspec as gridspec
import itertools
import numpy as np
# Rebuild the feature matrix / target, split, scale and fit an SVC with a
# linear kernel.
X = df.drop('Results', axis=1)
y = df["Results"]
X_train_org, X_test_org, y_train, y_test = train_test_split(X, y, random_state=0)

scaler = MinMaxScaler()
# fit_transform learns each column's min/max from the training data and rescales
# it to [0, 1] in one step, so a separate scaler.fit() call is unnecessary.
X_train = scaler.fit_transform(X_train_org)
# The test set is rescaled with the training-set min/max only (no refit).
X_test = scaler.transform(X_test_org)

clf2 = SVC(kernel='linear', C=0.001)
clf2.fit(X_train, y_train)
print('Train score: {:.4f}'.format(clf2.score(X_train, y_train)))
print('Test score: {:.4f}'.format(clf2.score(X_test, y_test)))


In [None]:
%matplotlib inline

from mlxtend.plotting import plot_decision_regions
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
import matplotlib.gridspec as gridspec
import itertools
import numpy as np

# RBF-kernel SVC fitted on the scaled split created in the linear-kernel cell.
# The regularisation strength is given directly to the constructor; the former
# standalone `C = 1` assignment was never read and contradicted the value
# actually used, so it has been dropped.
clf3 = SVC(kernel='rbf', gamma=0.001, C=0.001)
clf3.fit(X_train, y_train)

print('Train score: {:.4f}'.format(clf3.score(X_train, y_train)))
print('Test score: {:.4f}'.format(clf3.score(X_test, y_test)))


In [None]:
%matplotlib inline

from mlxtend.plotting import plot_decision_regions
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
import matplotlib.gridspec as gridspec
import itertools
import numpy as np

# Degree-3 polynomial-kernel SVC fitted on the scaled split created in the
# linear-kernel cell. The regularisation strength is given directly to the
# constructor; the former standalone `C = 1` assignment was never read and
# contradicted the value actually used, so it has been dropped.
clf = SVC(kernel='poly', degree=3, gamma=0.001, C=0.001)
clf.fit(X_train, y_train)

print('Train score: {:.4f}'.format(clf.score(X_train, y_train)))
print('Test score: {:.4f}'.format(clf.score(X_test, y_test)))


## Decision tree Classifier

In [None]:
%matplotlib notebook
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Decision tree fitted on the raw (unscaled) features: rebuild X/y, split with
# a fixed seed, fit, and report accuracy on both splits.
y = df["Results"]
X = df.drop(columns='Results')

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

dtree = DecisionTreeClassifier(
    min_samples_split=49,
    max_depth=None,
    random_state=0,
)
dtree.fit(X_train, y_train)

train_accuracy = dtree.score(X_train, y_train)
test_accuracy = dtree.score(X_test, y_test)
print("Accuracy on training set: {:.4f}".format(train_accuracy))
print("Accuracy on test set: {:.4f}".format(test_accuracy))

We see that the decision tree classifier and the K-nearest-neighbours classifier give the best results for this dataset.

### <font color='Green'>K neighbour classification
 Train score: 0.79036
 Test score: 0.76376
### <font color='blue'>Logistic Regression
 Train score: 0.7827
 Test score: 0.7798
### <font color='blue'>Linear SVC
 Train score: 0.7827
 Test score: 0.7798
### <font color='blue'>Kernel linear SVC
 Train score: 0.7766
 Test score: 0.7982
### <font color='blue'>Kernel rbf SVC
 Train score: 0.7766
 Test score: 0.7982
### <font color='blue'>Kernel Poly SVC
 Train score: 0.7766
 Test score: 0.7982
### <font color='Green'>Decision Tree Classifier:
 Accuracy on training set: 0.8103
 Accuracy on test set: 0.7683