# Importing Necessary Libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Reading the data file (CSV)

In [4]:
# Load the raw dataset into a DataFrame.
# NOTE(review): this is an absolute, user-specific Windows path, so the notebook
# only runs unchanged on the original author's machine.
csv_path = r"C:\Users\PREETHAM\Downloads\Internship_data.csv"
df = pd.read_csv(csv_path)

In [5]:
# Display the freshly loaded frame to eyeball the raw rows.
df

Unnamed: 0,x1,x2,y
0,-119.366669,1.150000e+02,1
1,-101.108044,9.777716e+01,1
2,-130.278658,1.067677e+02,1
3,-114.703415,1.011955e+02,1
4,-119.366669,1.150000e+02,1
...,...,...,...
2222,98.714112,8.964312e+01,0
2223,96.633331,9.100000e+01,0
2224,85.673940,1.038393e+02,0
2225,78.958862,7.860537e+01,0


In [6]:
# (rows, columns) of the frame as loaded.
df.shape

(2227, 3)

In [7]:
# Per-column dtype and non-null count; a non-null count below the number of
# entries means that column has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2227 entries, 0 to 2226
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x1      2222 non-null   float64
 1   x2      2224 non-null   float64
 2   y       2227 non-null   int64  
dtypes: float64(2), int64(1)
memory usage: 52.3 KB


In [8]:
# Count the missing entries in each column.
df.isna().sum()

x1    5
x2    3
y     0
dtype: int64

The dataset has missing values: 5 in `x1` and 3 in `x2`.

# Dropping the null values

In [10]:
# Keep only the rows in which every column is populated.
df = df.dropna(axis=0, how="any")

In [11]:
# (rows, columns) after the incomplete rows were removed.
df.shape

(2219, 3)

In [12]:
# Confirm that no missing entries remain in any column.
df.isna().sum()

x1    0
x2    0
y     0
dtype: int64

In [13]:
# Display the frame again now that rows with missing values are gone.
df

Unnamed: 0,x1,x2,y
0,-119.366669,1.150000e+02,1
1,-101.108044,9.777716e+01,1
2,-130.278658,1.067677e+02,1
3,-114.703415,1.011955e+02,1
4,-119.366669,1.150000e+02,1
...,...,...,...
2222,98.714112,8.964312e+01,0
2223,96.633331,9.100000e+01,0
2224,85.673940,1.038393e+02,0
2225,78.958862,7.860537e+01,0


# Separating the target column

In [14]:
# Split the label off from the features: `target` holds the "y" column and
# `df` is rebound to the remaining columns.
# NOTE(review): because `df` is rebound without "y", running this cell a second
# time fails on the "y" lookup.
target = df.loc[:, "y"]
df = df.drop(columns=["y"])

In [18]:
# Number of rows per label value, i.e. the class balance.
target.value_counts()

0    1238
1     981
Name: y, dtype: int64

# Standardizing the data

In [19]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column of `df`.
# NOTE(review): the scaler is fitted on all rows of `df`, i.e. before any
# train/test split, so every row contributes to the scaling statistics.
rescale = StandardScaler()
rescale.fit(df)
stand_train_data = rescale.transform(df)

In [21]:
# Shape of the standardised feature array.
stand_train_data.shape

(2219, 2)

In [22]:
# Shape of the label vector; its length should match the feature row count.
target.shape

(2219,)

# Test-Train split

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the *unscaled* features first (80% train / 20% test, fixed seed), then
# learn the scaling statistics from the training rows only. Scaling the whole
# dataset before splitting lets the test rows influence the mean/std used to
# transform the training data, which leaks test information into model fitting.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df, target, train_size=0.80, random_state=0
)

# Fit on train, apply the same transformation to both partitions.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# GridSearchCV and RandomizedSearchCV

In [27]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report

In [31]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 

# Logistic regression

In [34]:
import warnings
warnings.filterwarnings('ignore')

In [35]:
%%time
tuned_parameters = [{'penalty':["l1","l2","elasticnet"], 'C':[1, 10,100,1000]}]

evaluation_metric = ["accuracy", "f1","roc_auc","precision","recall"]
for value in evaluation_metric:
    

    clf = GridSearchCV(
        LogisticRegression(), tuned_parameters, scoring='{}'.format(value),cv = 10
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on train set:")
    print(clf.best_params_)
    print(clf.best_estimator_)
    print("For {}".format(value))
    print()
    print(clf.score(X_test, y_test))
    print()

Best parameters set found on train set:
{'C': 1, 'penalty': 'l2'}
LogisticRegression(C=1)
For accuracy

0.5968468468468469

Best parameters set found on train set:
{'C': 1, 'penalty': 'l2'}
LogisticRegression(C=1)
For f1

0.40531561461794013

Best parameters set found on train set:
{'C': 1, 'penalty': 'l2'}
LogisticRegression(C=1)
For roc_auc

0.5847516569838802

Best parameters set found on train set:
{'C': 1, 'penalty': 'l2'}
LogisticRegression(C=1)
For precision

0.6161616161616161

Best parameters set found on train set:
{'C': 1, 'penalty': 'l2'}
LogisticRegression(C=1)
For recall

0.30198019801980197

Wall time: 1.57 s


# SVM

In [44]:
%%time
tuned_parameters = [{'kernel': ['linear'], 'C': [1,10, 100, 1000]}
                    
                         ]
evaluation_metric = ["accuracy", "f1","roc_auc","precision","recall"]
for value in evaluation_metric:


    clf = RandomizedSearchCV(
        SVC(), tuned_parameters, scoring='{}'.format(value),cv= 10
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on train set:")
    print(clf.best_params_)
    print(clf.best_estimator_)
    print("For {}".format(value))
    print()

    print(clf.score(X_test, y_test))

Best parameters set found on train set:
{'kernel': 'linear', 'C': 1}
SVC(C=1, kernel='linear')
For accuracy

0.545045045045045
Best parameters set found on train set:
{'kernel': 'linear', 'C': 1}
SVC(C=1, kernel='linear')
For f1

0.0
Best parameters set found on train set:
{'kernel': 'linear', 'C': 1000}
SVC(C=1000, kernel='linear')
For roc_auc

0.5806194255789215
Best parameters set found on train set:
{'kernel': 'linear', 'C': 1}
SVC(C=1, kernel='linear')
For precision

0.0
Best parameters set found on train set:
{'kernel': 'linear', 'C': 1}
SVC(C=1, kernel='linear')
For recall

0.0
Wall time: 12min 41s


In [42]:
%%time
tuned_parameters = [{'kernel': ['rbf'], 'gamma': np.random.rand(3) , 'C': [1, 10, 100, 1000]}
                         ]
evaluation_metric = ["accuracy", "f1","roc_auc","precision","recall"]
for value in evaluation_metric:


    clf = RandomizedSearchCV(
        SVC(), tuned_parameters, scoring='{}'.format(value),cv= 10
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on train set:")
    print(clf.best_params_)
    print(clf.best_estimator_)
    print("For {}".format(value))
    print()

    print(clf.score(X_test, y_test))


Best parameters set found on train set:
{'kernel': 'rbf', 'gamma': 0.5408959669480411, 'C': 10}
SVC(C=10, gamma=0.5408959669480411)
For accuracy

0.5990990990990991
Best parameters set found on train set:
{'kernel': 'rbf', 'gamma': 0.5408959669480411, 'C': 10}
SVC(C=10, gamma=0.5408959669480411)
For f1

0.4573170731707317
Best parameters set found on train set:
{'kernel': 'rbf', 'gamma': 0.5408959669480411, 'C': 1000}
SVC(C=1000, gamma=0.5408959669480411)
For roc_auc

0.5938343834383439
Best parameters set found on train set:
{'kernel': 'rbf', 'gamma': 0.0788005070597696, 'C': 100}
SVC(C=100, gamma=0.0788005070597696)
For precision

0.6388888888888888
Best parameters set found on train set:
{'kernel': 'rbf', 'gamma': 0.5408959669480411, 'C': 1000}
SVC(C=1000, gamma=0.5408959669480411)
For recall

0.38613861386138615
Wall time: 2min 7s


# Decision Tree

In [40]:
%%time
tuned_parameters = [{'criterion':['gini','entropy'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]}]

evaluation_metric = ["accuracy", "f1","roc_auc","precision","recall"]
for value in evaluation_metric:
    

    clf = GridSearchCV(
        DecisionTreeClassifier(), tuned_parameters, scoring='{}'.format(value),cv = 10
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on train set:")
    print(clf.best_params_)
    print(clf.best_estimator_)
    print("For {}".format(value))
    print()
    print(clf.score(X_test, y_test))
    print()

Best parameters set found on train set:
{'criterion': 'gini', 'max_depth': 6}
DecisionTreeClassifier(max_depth=6)
For accuracy

0.9481981981981982

Best parameters set found on train set:
{'criterion': 'gini', 'max_depth': 6}
DecisionTreeClassifier(max_depth=6)
For f1

0.9443099273607748

Best parameters set found on train set:
{'criterion': 'gini', 'max_depth': 6}
DecisionTreeClassifier(max_depth=6)
For roc_auc

0.9872555437361917

Best parameters set found on train set:
{'criterion': 'entropy', 'max_depth': 4}
DecisionTreeClassifier(criterion='entropy', max_depth=4)
For precision

1.0

Best parameters set found on train set:
{'criterion': 'entropy', 'max_depth': 6}
DecisionTreeClassifier(criterion='entropy', max_depth=6)
For recall

0.9801980198019802

Wall time: 5.8 s


# KNN

In [41]:
%%time
tuned_parameters = [{'n_neighbors':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p':[1, 2, 3]}]
evaluation_metric = ["accuracy", "f1","roc_auc","precision","recall"]
for value in evaluation_metric:
    

    clf = GridSearchCV(
        KNeighborsClassifier(), tuned_parameters, scoring='{}'.format(value), cv = 10
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on train set:")
    print(clf.best_params_)
    print(clf.best_estimator_)
    print("For {}".format(value))
    print()
    print(clf.score(X_test, y_test))
    print()

Best parameters set found on train set:
{'n_neighbors': 2, 'p': 1}
KNeighborsClassifier(n_neighbors=2, p=1)
For accuracy

0.5472972972972973

Best parameters set found on train set:
{'n_neighbors': 1, 'p': 1}
KNeighborsClassifier(n_neighbors=1, p=1)
For f1

0.544987146529563

Best parameters set found on train set:
{'n_neighbors': 10, 'p': 1}
KNeighborsClassifier(n_neighbors=10, p=1)
For roc_auc

0.5558669503313968

Best parameters set found on train set:
{'n_neighbors': 2, 'p': 1}
KNeighborsClassifier(n_neighbors=2, p=1)
For precision

0.5060240963855421

Best parameters set found on train set:
{'n_neighbors': 1, 'p': 1}
KNeighborsClassifier(n_neighbors=1, p=1)
For recall

0.5247524752475248

Wall time: 10.9 s


### After hyperparameter tuning, we observe that the Decision Tree algorithm performs best
### The Decision Tree gives the best precision and F1 score, and it is very fast to compute