# Data Cleaning

In [1]:
# Importing Librareis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

data = train.append(test, ignore_index=True, sort=False)

In [3]:
data.drop(['Roof(Area)','id','ANB'], axis=1, inplace=True)

In [4]:
for column in ['Troom', 'Nbedrooms', 'Nbwashrooms', 'Twashrooms']:
    data[column].fillna(data[column].mode()[0], inplace=True)
    
data['Lawn(Area)']=data['Lawn(Area)'].fillna(round(data['Lawn(Area)'].mean()))
data['API']=data['API'].fillna(round(data['API'].mean()))

In [5]:
data.at[data['roof']=='NO','roof']=0
data.at[data['roof']=='YES','roof']=1
data.at[data['roof']=='yes','roof']=1
data.at[data['roof']=='no','roof']=0

data['EXPECTED'] =data['EXPECTED'].replace({'\$':''}, regex = True)

In [6]:
train = data.iloc[:7000]
test = data.iloc[7000:]

In [7]:
test.drop(['Grade'], axis=1, inplace=True)

In [8]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=1)
test=imputer.fit_transform(test)
test=pd.DataFrame(test)

In [9]:
X=train[['Area(total)','Troom', 'Nbedrooms', 'Nbwashrooms','Twashrooms', 'roof', 'Lawn(Area)', 'Nfloors', 
         'API', 'EXPECTED']]
y=train['Grade']

In [10]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=1)
X=imputer.fit_transform(X)
X=pd.DataFrame(X)

In [11]:
test.columns = ['Area(total)','Troom', 'Nbedrooms','Nbwashrooms', 'Twashrooms', 'roof',
                    'Lawn(Area)', 'Nfloors', 'API', 'EXPECTED']

In [12]:
X.columns = ['Area(total)','Troom', 'Nbedrooms','Nbwashrooms', 'Twashrooms', 'roof',
                    'Lawn(Area)', 'Nfloors', 'API', 'EXPECTED']

# Data Spliting

In [15]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# KNN Model

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import cross_val_score

clf_knn=KNeighborsClassifier(n_neighbors=70)
clf_knn.fit(x_train,y_train)
knn_pred=clf_knn.predict(x_test)

cfm_knn=confusion_matrix(y_test,knn_pred)
print(cfm_knn)
Accuracy_knn_train = clf_knn.score(x_train,y_train)
print("knn train score:-",Accuracy_knn_train)
Accuracy_knn_test = accuracy_score(y_test,knn_pred)
print("knn test score:-",Accuracy_knn_test)
cv_knn = cross_val_score(clf_knn, x_train, y_train, cv=5)
print("knn cross val score:-",round(cv_knn.mean(), 2) * 100)
print(classification_report(y_test,knn_pred))
knn_pred

[[ 39  56   6   0   0]
 [ 20 144 131   0   0]
 [  0  46 445  61   0]
 [  0   0 194 250   0]
 [  0   0   0   8   0]]
knn train score:- 0.6773214285714285
knn test score:- 0.6271428571428571
knn cross val score:- 66.0
              precision    recall  f1-score   support

           A       0.66      0.39      0.49       101
           B       0.59      0.49      0.53       295
           C       0.57      0.81      0.67       552
           D       0.78      0.56      0.66       444
           E       0.00      0.00      0.00         8

    accuracy                           0.63      1400
   macro avg       0.52      0.45      0.47      1400
weighted avg       0.65      0.63      0.62      1400



array(['B', 'C', 'B', ..., 'D', 'A', 'C'], dtype=object)

In [17]:
parameters=[{'n_neighbors':[10,20,30,40,50,60,70,80,90]}]

grid_search= GridSearchCV(estimator= clf_knn,param_grid=parameters,scoring='accuracy',cv=5,n_jobs= -1)

grid_search=grid_search.fit(x_train, y_train)
best_accuracy_knn=grid_search.best_score_
best_parameters_knn=grid_search.best_params_

In [18]:
best_accuracy_knn

0.665

In [19]:
best_parameters_knn

{'n_neighbors': 70}

# Random Forest Model

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import cross_val_score

clf_rf=RandomForestClassifier(criterion='entropy',random_state = 0,n_estimators = 90)
clf_rf.fit(x_train,y_train)
rf_pred=clf_rf.predict(x_test)

cfm_rf=confusion_matrix(y_test,rf_pred)
print(cfm_rf)
Accuracy_rf_train = clf_rf.score(x_train,y_train)
print("rf train score:-",Accuracy_rf_train)
Accuracy_rf_test = accuracy_score(y_test,rf_pred)
print("rf test score:-",Accuracy_rf_test)
cv_rf = cross_val_score(clf_rf, x_train, y_train, cv=5)
print("rf cross val score:-",round(cv_rf.mean(), 2) * 100)
print(classification_report(y_test,rf_pred))
rf_pred

[[ 71  30   0   0   0]
 [  8 250  37   0   0]
 [  0  26 489  37   0]
 [  0   0  38 406   0]
 [  0   0   0   6   2]]
rf train score:- 1.0
rf test score:- 0.87
rf cross val score:- 87.0
              precision    recall  f1-score   support

           A       0.90      0.70      0.79       101
           B       0.82      0.85      0.83       295
           C       0.87      0.89      0.88       552
           D       0.90      0.91      0.91       444
           E       1.00      0.25      0.40         8

    accuracy                           0.87      1400
   macro avg       0.90      0.72      0.76      1400
weighted avg       0.87      0.87      0.87      1400



array(['B', 'C', 'B', ..., 'C', 'A', 'D'], dtype=object)

In [30]:
parameters=[{'n_estimators' : [10,20,30,40,50,60,70,80,90], 'criterion' : ['entropy'],'random_state':[0,1,2]},
           {'n_estimators' : [10,20,30,40,50,60,70,80,90], 'criterion' : ['gini'],'random_state':[0,1,2]}]

grid_search= GridSearchCV(estimator= classifier_rf,param_grid=parameters,scoring='accuracy',cv=5,n_jobs= -1)

grid_search=grid_search.fit(x_train,y_train)
best_accuracy_rf=grid_search.best_score_
best_parameters_rf=grid_search.best_params_

In [31]:
best_accuracy_rf

0.8719642857142856

In [32]:
best_parameters_rf

{'criterion': 'entropy', 'n_estimators': 90, 'random_state': 0}

# SVM Model

In [21]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import cross_val_score

clf_svc=SVC(gamma=0.1)
clf_svc.fit(x_train,y_train)
svc_pred=clf_svc.predict(x_test)

cfm_svc=confusion_matrix(y_test,svc_pred)
print(cfm_svc)
Accuracy_svc_train = clf_svc.score(x_train,y_train)
print("svc train score:-",Accuracy_svc_train)
Accuracy_svc_test = accuracy_score(y_test,svc_pred)
print("svc test score:-",Accuracy_svc_test)
cv_svc = cross_val_score(clf_svc, x_train, y_train, cv=5)
print("svc cross val score:-",round(cv_svc.mean(), 2) * 100)
print(classification_report(y_test,svc_pred))
svc_pred

[[  0   0 101   0   0]
 [  0   0 295   0   0]
 [  0   0 552   0   0]
 [  0   0 443   1   0]
 [  0   0   8   0   0]]
svc train score:- 0.9998214285714285
svc test score:- 0.395
svc cross val score:- 42.0
              precision    recall  f1-score   support

           A       0.00      0.00      0.00       101
           B       0.00      0.00      0.00       295
           C       0.39      1.00      0.57       552
           D       1.00      0.00      0.00       444
           E       0.00      0.00      0.00         8

    accuracy                           0.40      1400
   macro avg       0.28      0.20      0.11      1400
weighted avg       0.47      0.40      0.22      1400



array(['C', 'C', 'C', ..., 'C', 'C', 'C'], dtype=object)

### Running model on test data and generating predictions with RFC Non Generalized Model as it is giving the better accuracy precision

In [22]:
y_pred = clf_rf.predict(test)

In [23]:
test["Grade"]=y_pred
test["EXPECTED"]=test["EXPECTED"].astype(str)+"$"

In [25]:
test.to_csv('Prediction.csv')