## Apply Machine Learning Algo to the dataset

In [1]:
#load important libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

pd.pandas.set_option('display.max_rows',None)
pd.pandas.set_option('display.max_columns',None)

from sklearn.metrics import accuracy_score


In [2]:
#import EDA files 
df = pd.read_csv('Train_cleaned.csv')
df_test = pd.read_csv('Test_cleaned.csv')

In [3]:
#import original test file for storing appno which will be needed in submission file of the competition
dx = pd.read_csv('test.csv')
app_no_testdata = dx['appno']

In [4]:
#Shape of the data
print(df.shape)
print(df_test.shape)

(8878, 384)
(4760, 383)


<b>Preprocess data 

In [5]:
#label and target create

X = df.drop('importance',axis=1).values
y= df['importance'].values

In [6]:
#shape of X and y

print(X.shape)
print(y.shape)

(8878, 383)
(8878,)


In [7]:
#split the data in train and test set

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [8]:
#scale the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
#set test set

X1_test = df_test.values

In [10]:
#scale test set

X1_test = scaler.transform(X1_test)

## SVM

In [11]:
#import SVM and GridSearchCV

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [12]:
#use GridSearchCV for parameter tune

# svc = SVC()
# parameters = {
#     'kernel': ['rbf','poly'], #######85,82.5
#     'C': [0.1,0.5,1],
#     'gamma': [0.001,0.01,0.1,1]
# }

# svc = SVC()
# parameters = {
#     'kernel': ['linear', 'rbf','poly','sigmoid'],
#     'C': [0.1, 1]
# }

# cv_svm = GridSearchCV(svc, parameters, cv=5)
# cv_svm.fit(X_train, y_train)

# cv_svm.best_params_
# {'C': 1, 'kernel': 'poly'}

In [13]:
#fitting the data with hyperparameter from GridSearch

svc_model=SVC(C=1,kernel='rbf',gamma=0.1)
svc_model.fit(X_train, y_train)

SVC(C=1, gamma=0.1)

In [14]:
#Check the accuracy score

pred_svc = svc_model.predict(X_test)
print('Test Score', 100* accuracy_score(y_test,pred_svc))

pred_actual = svc_model.predict(X_train)
print('Train Score', 100* accuracy_score(y_train,pred_actual))

Test Score 85.72072072072072
Train Score 90.41754280564734


Got submission score- 82.7

## Naive Bayes

In [15]:
#import Multinomial naive bayes

from sklearn.naive_bayes import MultinomialNB
MultinomialNB()

MultinomialNB()

In [16]:
#hyperparameter tune

# nb = MultinomialNB()
# parameters = {
#     'alpha': [4,3,2,1, 0.1, 0.01, 0.001]  
# }
# cv_nb = GridSearchCV(nb, parameters, cv=5)
# cv_nb.fit(X_train, y_train)

# cv_nb.best_params_
#{'alpha': 4}

In [17]:
#fitting the data with hyperparameter from GridSearch
#Check the accuracy score

nb_model = MultinomialNB(alpha=4)
nb_model.fit(X_train, y_train)
pred_nb = nb_model.predict(X_test)
print('Score', 100* accuracy_score(y_test,pred_nb))


pred_actual = nb_model.predict(X_train)
print('Train Score', 100* accuracy_score(y_train,pred_actual))

Score 80.49549549549549
Train Score 80.6848903574647


Performs poorly

## Random Forest

In [18]:
#import RandomForest

from sklearn.ensemble import RandomForestClassifier

In [19]:
#hyperparameter tune

# rf = RandomForestClassifier()
# parameters = {
#     'n_estimators': [100,200,300], 
#     'max_depth': [10,12,14],
# }

# cv_rf = GridSearchCV(rf, parameters, cv=5)
# cv_rf.fit(X_train, y_train)

# cv_rf.best_params_ #{'max_depth': 14, 'n_estimators': 300}

In [20]:
#fitting the data with hyperparameter from GridSearch

rf_model=RandomForestClassifier(n_estimators=300,max_depth=14)
rf_model.fit(X_train, y_train)

RandomForestClassifier(max_depth=14, n_estimators=300)

In [21]:
#Check the accuracy score

pred_rf = rf_model.predict(X_test)
print('Score', 100* accuracy_score(y_test,pred_rf))


pred_actual = rf_model.predict(X_train)
print('Train Score', 100* accuracy_score(y_train,pred_actual))

Score 86.57657657657658
Train Score 92.67047161309702


Got submission score- 84.75

## XGB

In [22]:
#import xgboost classifier

from xgboost import XGBClassifier

In [23]:
#GridSearch hyperparameter

# xgb = XGBClassifier()
# parameters = {
#     'n_estimators': [200,300], 
#     'max_depth': [8,10,12],
#     'learning_rate': [0.1,0.8],
#     'reg_lambda':[1,0.1]
# }

# cv_xgb = GridSearchCV(xgb, parameters, cv=5)
# cv_xgb.fit(X_train, y_train)

# cv_xgb.best_params_

In [35]:
#fitting the data with hyperparameter from GridSearch
#Check the accuracy score


xgb_model=XGBClassifier(n_estimators=200,max_depth=8,reg_lambda=1,learning_rate=0.1)
xgb_model.fit(X_train, y_train)
pred_xgb = xgb_model.predict(X_test)
print('Score', 100* accuracy_score(y_test,pred_xgb))

pred_actual = xgb_model.predict(X_train)
print('Score', 100* accuracy_score(y_train,pred_actual))

Score 88.51351351351352
Score 98.81345749474316


**As xgboost classifier has much higher accuracy, we select this model to predict the given test data for submission**

## Final Prediction

In [36]:
#Create final submission file with prediction from xgboost classifier

pred_final = xgb_model.predict(X1_test)
d_pred={'appno':app_no_testdata,'importance':pred_final}
pred_csv = pd.DataFrame(d_pred)

pred_csv.to_csv('prediction7.csv', index = False)