# Modelling

## Import Libraries

In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import figure
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from tabulate import tabulate

## Import the sampled csv data

In [71]:
df_og= pd.read_csv('Data/data_og.csv')  
df_nm1= pd.read_csv('Data/data_nm1.csv')  
df_nm2= pd.read_csv('Data/data_nm2.csv')  
df_nm3= pd.read_csv('Data/data_nm3.csv')  
df_rus= pd.read_csv('Data/data_rus.csv')  
df_ros= pd.read_csv('Data/data_ros.csv')  
df_smote= pd.read_csv('Data/data_smote.csv')  
df_smoteen= pd.read_csv('Data/data_smoteen.csv')  

## Models

### Logisitic Regression

In [72]:
def modellr(df_t,m):
    X = df_t.iloc[:, :-1]
    y = df_t.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

    #Feature Scaling
    sc = StandardScaler()
    X_train.loc[:,:] = sc.fit_transform(X_train.loc[:,:])
    X_test.loc[:,:] = sc.transform(X_test.loc[:,:])

    #Training the model
    classifier = LogisticRegression(random_state = 0)
    classifier.fit(X_train, y_train)

    #Predict
    y_pred = classifier.predict(X_test)

    from sklearn.model_selection import cross_val_score
    accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
    
    #Metrics
    cm = confusion_matrix(y_test, y_pred)
    disp.append([m,round(accuracy_score(y_test, y_pred)*100,2),round(precision_score(y_test, y_pred,zero_division=0),2),round(recall_score(y_test, y_pred),2),(round(f1_score(y_test, y_pred),2)),round(accuracies.mean()*100,2),round(accuracies.mean()*100,2)])
    allmodels.append([m,round(accuracy_score(y_test, y_pred)*100,2),round(precision_score(y_test, y_pred,zero_division=0),2),round(recall_score(y_test, y_pred),2),(round(f1_score(y_test, y_pred),2)),round(accuracies.mean()*100,2),round(accuracies.mean()*100,2)])
    

In [73]:
#Splitting the dataset into the Training set and Test set
#data_sample_set = ['df_og','df_nm1','df_nm2','df_nm3','df_rus','df_ros','df_smote','df_smoteen']
data_sample_set = [df_og,df_nm1,df_nm2,df_nm3,df_rus,df_ros,df_smote,df_smoteen]
names = ['LogisticRegression Original','LogisticRegression Near Miss1', 'LogisticRegression Near Miss2','LogisticRegression Near Miss3','LogisticRegression Random UnderSampling','LogisticRegression Random Sampling','LogisticRegression Smote','LogisticRegression Smoteen']

allmodels = []

disp = []

for i in range(0,8):
    modellr(data_sample_set[i],names[i])

In [74]:
print("Model used : Logistic Regression\n")
print(tabulate(disp, headers=["Sampling Technique", "Accuracy", "Precision","Recall","F1 Score","CV Score"]))

Model used : Logistic Regression

                                           Sampling Technique    Accuracy    Precision    Recall    F1 Score    CV Score
---------------------------------------  --------------------  ----------  -----------  --------  ----------  ----------
LogisticRegression Original                             98.69        0            0         0          98.49       98.49
LogisticRegression Near Miss1                           67.24        0.7          0.62      0.66       65.97       65.97
LogisticRegression Near Miss2                           79.31        0.87         0.7       0.77       77.91       77.91
LogisticRegression Near Miss3                           62.07        0.62         0.64      0.63       65.47       65.47
LogisticRegression Random UnderSampling                 71.98        0.75         0.67      0.71       75.47       75.47
LogisticRegression Random Sampling                      76.15        0.78         0.73      0.75       76.38       76.3

### KNN

In [75]:
def modelknn(df_t,m):
    X = df_t.iloc[:, :-1]
    y = df_t.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

    #Feature Scaling
    sc = StandardScaler()
    X_train.loc[:,:] = sc.fit_transform(X_train.loc[:,:])
    X_test.loc[:,:] = sc.transform(X_test.loc[:,:])

    #Training the model
    classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    classifier.fit(X_train, y_train)

    #Predict
    y_pred = classifier.predict(X_test)

    accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
    
    #Metrics
    cm = confusion_matrix(y_test, y_pred)
    
    disp.append([m,round(accuracy_score(y_test, y_pred)*100,2),round(precision_score(y_test, y_pred,zero_division=0),2),round(recall_score(y_test, y_pred),2),(round(f1_score(y_test, y_pred),2)),round(accuracies.mean()*100,2)])
    allmodels.append([m,round(accuracy_score(y_test, y_pred)*100,2),round(precision_score(y_test, y_pred,zero_division=0),2),round(recall_score(y_test, y_pred),2),(round(f1_score(y_test, y_pred),2)),round(accuracies.mean()*100,2)])
    
#Splitting the dataset into the Training set and Test set
#data_sample_set = ['df_og','df_nm1','df_nm2','df_nm3','df_rus','df_ros','df_smote','df_smoteen']
data_sample_set = [df_og,df_nm1,df_nm2,df_nm3,df_rus,df_ros,df_smote,df_smoteen]
names = ['KNN Original','KNN Near Miss1', 'KNN Near Miss2','KNN Near Miss3','KNN Random UnderSampling','KNN Random Sampling','KNN Smote','KNN Smoteen' ]

disp = []

for i in range(0,8):
    modelknn(data_sample_set[i],names[i])
    

In [76]:
print("Model used : KNN\n")
print(tabulate(disp, headers=["Sampling Technique", "Accuracy", "Precision","Recall","F1 Score","CV Score"]))

Model used : KNN

Sampling Technique          Accuracy    Precision    Recall    F1 Score    CV Score
------------------------  ----------  -----------  --------  ----------  ----------
KNN Original                   98.64         0         0           0          98.41
KNN Near Miss1                 76.72         0.84      0.67        0.75       77.12
KNN Near Miss2                 92.24         0.99      0.86        0.92       88.2
KNN Near Miss3                 65.3          0.69      0.57        0.63       66.19
KNN Random UnderSampling       70.26         0.71      0.7         0.71       73.09
KNN Random Sampling            97.25         0.95      1           0.97       97.12
KNN Smote                      94.94         0.92      0.98        0.95       95.07
KNN Smoteen                    97.31         0.96      0.99        0.98       97.11


### Decision Tree

In [77]:
def modeldt(df_t,m):
    X = df_t.iloc[:, :-1]
    y = df_t.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

    #Feature Scaling
    sc = StandardScaler()
    X_train.loc[:,:] = sc.fit_transform(X_train.loc[:,:])
    X_test.loc[:,:] = sc.transform(X_test.loc[:,:])

    #Training the model
    from sklearn.tree import DecisionTreeClassifier
    classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier.fit(X_train,y_train)

    #Predict
    y_pred = classifier.predict(X_test)

    accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
    
    #Metrics
    cm = confusion_matrix(y_test, y_pred)
    
    disp.append([m,round(accuracy_score(y_test, y_pred)*100,2),round(precision_score(y_test, y_pred,zero_division=0),2),round(recall_score(y_test, y_pred),2),(round(f1_score(y_test, y_pred),2)),round(accuracies.mean()*100,2)])
    allmodels.append([m,round(accuracy_score(y_test, y_pred)*100,2),round(precision_score(y_test, y_pred,zero_division=0),2),round(recall_score(y_test, y_pred),2),(round(f1_score(y_test, y_pred),2)),round(accuracies.mean()*100,2)])
    
#Splitting the dataset into the Training set and Test set
#data_sample_set = ['df_og','df_nm1','df_nm2','df_nm3','df_rus','df_ros','df_smote','df_smoteen']
data_sample_set = [df_og,df_nm1,df_nm2,df_nm3,df_rus,df_ros,df_smote,df_smoteen]
names = ['DecisionTree Original','DecisionTree Near Miss1', 'DecisionTree Near Miss2','DecisionTree Near Miss3','DecisionTree Random UnderSampling','DecisionTree Random Sampling','DecisionTree Smote','DecisionTree Smoteen' ]

disp = []

for i in range(0,8):
    modeldt(data_sample_set[i],names[i])
    

In [78]:
print("Model used : Decision Tree\n")
print(tabulate(disp, headers=["Sampling Technique", "Accuracy", "Precision","Recall","F1 Score","CV Score"]))

Model used : Decision Tree

Sampling Technique                   Accuracy    Precision    Recall    F1 Score    CV Score
---------------------------------  ----------  -----------  --------  ----------  ----------
DecisionTree Original                   97.3          0.06      0.07        0.06       97.07
DecisionTree Near Miss1                 79.31         0.8       0.79        0.79       77.99
DecisionTree Near Miss2                 93.53         0.94      0.94        0.94       91.87
DecisionTree Near Miss3                 62.28         0.64      0.59        0.61       64.1
DecisionTree Random UnderSampling       62.72         0.63      0.63        0.63       66.26
DecisionTree Random Sampling            98.91         0.98      1           0.99       98.79
DecisionTree Smote                      96.89         0.96      0.98        0.97       96.7
DecisionTree Smoteen                    98.48         0.98      0.99        0.99       98.38


### Naive Bayes

In [79]:
def modelNB(df_t,m):
    X = df_t.iloc[:, :-1]
    y = df_t.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

    #Feature Scaling
    sc = StandardScaler()
    X_train.loc[:,:] = sc.fit_transform(X_train.loc[:,:])
    X_test.loc[:,:] = sc.transform(X_test.loc[:,:])

    #Training the model
    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    classifier.fit(X_train,y_train)

    #Predict
    y_pred = classifier.predict(X_test)

    accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
    
    #Metrics
    cm = confusion_matrix(y_test, y_pred)
    
    disp.append([m,round(accuracy_score(y_test, y_pred)*100,2),round(precision_score(y_test, y_pred,zero_division=0),2),round(recall_score(y_test, y_pred),2),(round(f1_score(y_test, y_pred),2)),round(accuracies.mean()*100,2)])
    allmodels.append([m,round(accuracy_score(y_test, y_pred)*100,2),round(precision_score(y_test, y_pred,zero_division=0),2),round(recall_score(y_test, y_pred),2),(round(f1_score(y_test, y_pred),2)),round(accuracies.mean()*100,2)])
    
#Splitting the dataset into the Training set and Test set
#data_sample_set = ['df_og','df_nm1','df_nm2','df_nm3','df_rus','df_ros','df_smote','df_smoteen']
data_sample_set = [df_og,df_nm1,df_nm2,df_nm3,df_rus,df_ros,df_smote,df_smoteen]
names = ['NaiveBayes Original','NaiveBayes Near Miss1', 'NaiveBayes Near Miss2','NaiveBayes Near Miss3','NaiveBayes Random UnderSampling','NaiveBayes Random Sampling','NaiveBayes Smote','NaiveBayes Smoteen']

disp = []

for i in range(0,8):
    modelNB(data_sample_set[i],names[i])
    

In [80]:
print("Model used : NaiveBayes\n")
print(tabulate(disp, headers=["Sampling Technique", "Accuracy", "Precision","Recall","F1 Score","CV Score"]))

Model used : NaiveBayes

Sampling Technique                 Accuracy    Precision    Recall    F1 Score    CV Score
-------------------------------  ----------  -----------  --------  ----------  ----------
NaiveBayes Original                   82.88         0.05      0.62        0.09       82.42
NaiveBayes Near Miss1                 58.19         0.92      0.19        0.32       57.77
NaiveBayes Near Miss2                 57.11         0.95      0.16        0.28       56.98
NaiveBayes Near Miss3                 59.27         0.58      0.7         0.63       62.01
NaiveBayes Random UnderSampling       69.18         0.71      0.66        0.68       71.58
NaiveBayes Random Sampling            71.85         0.72      0.72        0.72       71.83
NaiveBayes Smote                      84.3          0.78      0.95        0.86       84.37
NaiveBayes Smoteen                    87.34         0.84      0.94        0.89       87.37


### Random Forest

In [81]:
def modelRF(df_t,m):
    X = df_t.iloc[:, :-1]
    y = df_t.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

    #Feature Scaling
    sc = StandardScaler()
    X_train.loc[:,:] = sc.fit_transform(X_train.loc[:,:])
    X_test.loc[:,:] = sc.transform(X_test.loc[:,:])

    #Training the model
    from sklearn.ensemble import RandomForestClassifier
    classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
    classifier.fit(X_train, y_train)

    #Predict
    y_pred = classifier.predict(X_test)

    from sklearn.model_selection import cross_val_score
    accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
    
    #Metrics
    cm = confusion_matrix(y_test, y_pred)
    
    accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
    
    disp.append([m,round(accuracy_score(y_test, y_pred)*100,2),round(precision_score(y_test, y_pred,zero_division=0),2),round(recall_score(y_test, y_pred),2),(round(f1_score(y_test, y_pred),2)),round(accuracies.mean()*100,2)])
    allmodels.append([m,round(accuracy_score(y_test, y_pred)*100,2),round(precision_score(y_test, y_pred,zero_division=0),2),round(recall_score(y_test, y_pred),2),(round(f1_score(y_test, y_pred),2)),round(accuracies.mean()*100,2)])
    
#Splitting the dataset into the Training set and Test set
#data_sample_set = ['df_og','df_nm1','df_nm2','df_nm3','df_rus','df_ros','df_smote','df_smoteen']
data_sample_set = [df_og,df_nm1,df_nm2,df_nm3,df_rus,df_ros,df_smote,df_smoteen]
names = ['RandomForest Original','RandomForest Near Miss1', 'RandomForest Near Miss2','RandomForest Near Miss3','RandomForest Random UnderSampling','RandomForest Random Sampling','RandomForest Smote','RandomForest Smoteen']

disp = []

for i in range(0,8):
    modelRF(data_sample_set[i],names[i])
    

In [82]:
print("Model used : RandomForest\n")
print(tabulate(disp, headers=["Sampling Technique", "Accuracy", "Precision","Recall","F1 Score","CV Score"]))

Model used : RandomForest

Sampling Technique                   Accuracy    Precision    Recall    F1 Score    CV Score
---------------------------------  ----------  -----------  --------  ----------  ----------
RandomForest Original                   98.38         0.07      0.02        0.03       98.19
RandomForest Near Miss1                 81.47         0.84      0.79        0.81       79.64
RandomForest Near Miss2                 93.1          0.95      0.91        0.93       92.52
RandomForest Near Miss3                 66.38         0.71      0.57        0.63       69.5
RandomForest Random UnderSampling       70.04         0.72      0.67        0.69       72.52
RandomForest Random Sampling            99.38         0.99      1           0.99       99.31
RandomForest Smote                      97.37         0.97      0.98        0.97       97.34
RandomForest Smoteen                    98.92         0.99      0.99        0.99       98.8


### SVM

### Kernel SVM

### XGBoost

In [None]:
def modelXG(df_t,m):
    X = df_t.iloc[:, :-1]
    y = df_t.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

    #Feature Scaling
    sc = StandardScaler()
    X_train.loc[:,:] = sc.fit_transform(X_train.loc[:,:])
    X_test.loc[:,:] = sc.transform(X_test.loc[:,:])

    #Training the model
    from xgboost import XGBClassifier
    classifier = XGBClassifier()
    classifier.fit(X_train, y_train)
    #Predict
    y_pred = classifier.predict(X_test)

    accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
    
    #Metrics
    cm = confusion_matrix(y_test, y_pred)
    
    disp.append([m,round(accuracy_score(y_test, y_pred)*100,2),round(precision_score(y_test, y_pred,zero_division=0),2),round(recall_score(y_test, y_pred),2),(round(f1_score(y_test, y_pred),2)),round(accuracies.mean()*100,2)])
    allmodels.append([m,round(accuracy_score(y_test, y_pred)*100,2),round(precision_score(y_test, y_pred,zero_division=0),2),round(recall_score(y_test, y_pred),2),(round(f1_score(y_test, y_pred),2)),round(accuracies.mean()*100,2)])
    
#Splitting the dataset into the Training set and Test set
#data_sample_set = ['df_og','df_nm1','df_nm2','df_nm3','df_rus','df_ros','df_smote','df_smoteen']
data_sample_set = [df_og,df_nm1,df_nm2,df_nm3,df_rus,df_ros,df_smote,df_smoteen]
names = ['XGBoost Original','XGBoost Near Miss1', 'XGBoost Near Miss2','XGBoost Near Miss3','XGBoost Random UnderSampling','XGBoost Random Sampling','XGBoost Smote','XGBoost Smoteen' ]

disp = []

for i in range(0,8):
    modelXG(data_sample_set[i],names[i])













































































































In [None]:
print("Model used : XGBoost\n")
print(tabulate(disp, headers=["Sampling Technique", "Accuracy", "Precision","Recall","F1 Score","CV Score"]))

### CatBoost

In [None]:
def modelCB(df_t,m):
    X = df_t.iloc[:, :-1]
    y = df_t.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

    #Feature Scaling
    sc = StandardScaler()
    X_train.loc[:,:] = sc.fit_transform(X_train.loc[:,:])
    X_test.loc[:,:] = sc.transform(X_test.loc[:,:])

    #Training the model
    from catboost import CatBoostClassifier
    classifier = CatBoostClassifier()
    classifier.fit(X_train, y_train)
    
    #Predict
    y_pred = classifier.predict(X_test)

    #Metrics
    cm = confusion_matrix(y_test, y_pred)
    
    accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
    
    disp.append([m,round(accuracy_score(y_test, y_pred)*100,2),round(precision_score(y_test, y_pred,zero_division=0),2),round(recall_score(y_test, y_pred),2),(round(f1_score(y_test, y_pred),2)),round(accuracies.mean()*100,2)])

    allmodels.append([m,round(accuracy_score(y_test, y_pred)*100,2),round(precision_score(y_test, y_pred,zero_division=0),2),round(recall_score(y_test, y_pred),2),(round(f1_score(y_test, y_pred),2)),round(accuracies.mean()*100,2)])

#Splitting the dataset into the Training set and Test set
#data_sample_set = ['df_og','df_nm1','df_nm2','df_nm3','df_rus','df_ros','df_smote','df_smoteen']
data_sample_set = [df_og,df_nm1,df_nm2,df_nm3,df_rus,df_ros,df_smote,df_smoteen]
names = ['CatBoost Original','CatBoost Near Miss1', 'CatBoost Near Miss2','CatBoost Near Miss3','CatBoost Random UnderSampling','CatBoost Random Sampling','CatBoost Smote','CatBoost Smoteen' ]

disp = []

for i in range(0,8):
    modelCB(data_sample_set[i],names[i])

In [None]:
print("Model used : CatBoost\n")
print(tabulate(disp, headers=["Sampling Technique", "Accuracy", "Precision","Recall","F1 Score","CV Score"]))

## Comparing the performance

In [None]:
allmodels

In [None]:
from operator import itemgetter
modelsSortedF1Score = sorted(allmodels, key=itemgetter(-1), reverse=True)

In [None]:
print('Top 10 models\n')
print(tabulate(modelsSortedF1Score[0:10], headers=["Sampling Technique", "Accuracy", "Precision","Recall","F1 Score","CV Score"]))

**Best Models without Crossvalidation:**

RandomForest Random Over Sampling 

RandomForest Smoteen

DecisionTree Random Over Sampling

DecisionTree Smoteen

CatBoost Smoteen

**Best Models with Crossvalidation:**






## Export the tested model to a pickle file

In [None]:
import pickle
with open('banglore_home_prices_model.pickle','wb') as f:
    pickle.dump(lr_clf,f)

### Export location and column information to a file that will be useful later on in our prediction application

In [None]:
import json
columns = {
'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))