# Data modelling

In [None]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.svm import SVC

In [None]:
# Load the training CSV into a DataFrame (the file name suggests it is the
# output of an earlier data-preparation step -- confirm).
df_train = pd.read_csv('ds-tech-interview/train_data_preparation.csv')

In [None]:
# Load the test CSV into a DataFrame; predictions are written back onto this
# frame further down.
df_test = pd.read_csv('ds-tech-interview/test_data_preparation.csv')

## Model

### Specify the target variable

In [None]:
# Target vector: the TARGET_FLAG column of the training data.
y=df_train['TARGET_FLAG']

### Specify which variables we want to use as features

In [None]:
# Feature matrix: the training-data columns the models are fitted on.
# One name per line so additions and removals show up clearly in diffs.
X = df_train[[
    'KIDSDRIV_BIN',
    'HOMEKIDS_BIN',
    'URBANICITY_INDEX',
    'HOME_VAL_BIN',
    'logMVR_PTS',
    'REVOKED_INDEX',
    'logCLM_FREQ',
    'CLM_FREQ',
    'OLDCLAIM',
    'CAR_TYPE_INDEX',
    'TIF',
    'logBLUEBOOK',
    'CAR_USE_INDEX',
    'logTRAVTIME',
    'EDUCATION_BIN',
    'MSTATUS_INDEX',
    'logINCOME',
]]

### Split the data into training and testing sets

In [None]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# shuffled split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

### Testing several models

In [None]:
# Support-vector classifier with the library's default hyperparameters.
SVC_model = SVC()

In [None]:
# k-nearest-neighbours classifier that considers the 100 nearest training
# samples for each prediction.
KNN_model = KNeighborsClassifier(n_neighbors=100)

In [None]:
# Logistic regression with the solver capped at 300 iterations
# (presumably raised so the solver converges on this data -- confirm).
logreg_model = LogisticRegression(max_iter=300)

In [None]:
# Decision-tree classifier with the library's default hyperparameters.
clf_model = tree.DecisionTreeClassifier()

In [None]:
# Fit the support-vector classifier on the training split.
SVC_model.fit(X_train, y_train)

In [None]:
# Fit the k-nearest-neighbours classifier on the training split.
KNN_model.fit(X_train, y_train)

In [None]:
# Fit the logistic regression on the training split.
logreg_model.fit(X_train,y_train)

In [None]:
# Fit the decision tree on the training split.
# NOTE(review): no later cell in this section predicts with or scores
# clf_model -- confirm whether its evaluation was left out on purpose.
clf_model.fit(X_train,y_train)

In [None]:
# Predicted labels for the held-out split from the support-vector classifier.
SVC_prediction = SVC_model.predict(X_test)

In [None]:
# Predicted labels for the held-out split from the k-nearest-neighbours classifier.
KNN_prediction = KNN_model.predict(X_test)

In [None]:
# Predicted labels for the held-out split from the logistic regression.
logreg_prediction = logreg_model.predict(X_test)

### Accuracy score from SVC prediction

In [None]:
# Fraction of held-out rows the SVC labelled correctly. Arguments are named
# to follow the (y_true, y_pred) order of the function's signature.
print(accuracy_score(y_true=y_test, y_pred=SVC_prediction))

### Accuracy score from KNN prediction

In [None]:
# Fraction of held-out rows the KNN model labelled correctly. Arguments are
# named to follow the (y_true, y_pred) order of the function's signature.
print(accuracy_score(y_true=y_test, y_pred=KNN_prediction))

### Accuracy score from logreg prediction

In [None]:
# Fraction of held-out rows the logistic regression labelled correctly.
# Arguments are named to follow the (y_true, y_pred) order of the signature.
print(accuracy_score(y_true=y_test, y_pred=logreg_prediction))

### Print confusion matrix and classification report

In [None]:
# Confusion matrix for the SVC on the held-out split.
# confusion_matrix expects (y_true, y_pred): rows are the true labels and
# columns the predicted labels. Passing the predictions first would print
# the transposed matrix, so the true labels go first here.
print(confusion_matrix(y_test, SVC_prediction))

In [None]:
# Per-class precision / recall / F1 for the KNN model on the held-out split.
# classification_report expects (y_true, y_pred). With the arguments swapped,
# precision and recall are exchanged and "support" counts predictions rather
# than true labels, so the true labels go first here.
print(classification_report(y_test, KNN_prediction))

In [None]:
# Confusion matrix for the logistic regression on the held-out split.
# confusion_matrix expects (y_true, y_pred): rows are the true labels and
# columns the predicted labels. Passing the predictions first would print
# the transposed matrix, so the true labels go first here.
print(confusion_matrix(y_test, logreg_prediction))

### Predict the test-set TARGET_FLAG with logistic regression only (it achieved the better accuracy)

In [None]:
# Select from the test set exactly the columns, in the same order, that the
# models were trained on. Reusing the training matrix's column labels instead
# of repeating the name list keeps the two selections from drifting apart;
# a column missing from df_test still raises KeyError, as before.
X_target = df_test[list(X.columns)]

In [None]:
# Predict a label for every test-set row with the fitted logistic regression.
logreg_target = logreg_model.predict(X_target)

In [None]:
# Store the predictions on df_test as the TARGET_FLAG column
# (creating it, or overwriting it if the column already exists).
df_test['TARGET_FLAG']=logreg_target

In [None]:
# Count how many test rows were assigned to each predicted class.
df_test['TARGET_FLAG'].value_counts()

In [None]:
# Export the INDEX / predicted TARGET_FLAG pairs to CSV; index=False keeps
# the DataFrame's own row index out of the file.
df_test[['INDEX', 'TARGET_FLAG']].to_csv(
    'ds-tech-interview/TARGET_FLAG.csv',
    index=False,
)