In [None]:
# Import the required libraries.
# Some functions are imported later, in the cells where they are first needed.
import numpy as np
import pandas as pd
import random
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import warnings
# Suppress every FutureWarning for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)
np.random.seed(seed = 0)  # seed NumPy's global RNG so later np.random.* calls are repeatable

def decide_kopma(row):
    """Custom label encoding for a single value of the 'Kopma' column.

    Returns 0 (no ticket) when ``row`` equals 'Sorunsuz' and 1 (ticket)
    for every other value.
    """
    return 0 if row == 'Sorunsuz' else 1
print('Done!')

In [None]:
# Read the data; the first CSV column becomes the row index.
dc_df = pd.read_csv("../input/connectionlossdata/newdata.csv", header=0, index_col=0, low_memory=False)

# Rows without a ticket have an empty 'Kopma' cell: label them 'Sorunsuz' so they do not show up as NA.
# The result is assigned back instead of calling fillna(inplace=True) on a column selection,
# because that chained in-place form is deprecated and has no effect under pandas Copy-on-Write.
dc_df['Kopma'] = dc_df['Kopma'].fillna("Sorunsuz")

# Fill the empty cells of the categorical columns with a placeholder category
# so that they can be label encoded further down.
for categorical_col in ('POSSEHIR', 'POSSEMT', 'SANTRALADI', 'Atlak'):
    dc_df[categorical_col] = dc_df[categorical_col].fillna('A')

# Apply the custom encoding to the target: 'Sorunsuz' -> 0 (no ticket), anything else -> 1 (ticket).
# Series.map visits only this one column; the former DataFrame.apply(axis=1) built a
# full row object for every record just to read a single field.
dc_df['Kopma'] = dc_df['Kopma'].map(decide_kopma)

# Label encode every column from position 5 onwards; the encoder is re-fitted for each column.
dc_df.iloc[:,5:] = dc_df.iloc[:,5:].apply(LabelEncoder().fit_transform)

# Save the encoded frame so the preprocessing does not have to be repeated.
# NOTE(review): the later cells read "../input/connectionlossdata/newdata.csv", not this
# "./newdata.csv" copy - confirm that the saved file is the one made available at that path.
dc_df.to_csv("./newdata.csv")

In [None]:
# Load the saved .csv file; the first column is used as the row index.
dc_df = pd.read_csv(
    "../input/connectionlossdata/newdata.csv",
    header=0,
    index_col=0,
    low_memory=False,
)
print(Counter(dc_df['Kopma']))  # class distribution of the target column

# Target vector, and a separate feature frame without the target column
# (drop returns a new frame, so dc_df itself stays untouched).
y = dc_df['Kopma']
X = dc_df.drop(columns='Kopma')

# Start with every column as a selected feature (both names refer to the same list).
cols = list(X.columns)
selected_features = cols
print('Done!')

In [None]:
# Feature selection by backward elimination:
# fit an OLS model on the remaining columns, look up the column with the largest
# p-value, drop it if that p-value is above 0.05, and repeat until every
# remaining column is at or below the threshold (or no column is left).
cols = list(X.columns)
print('All columns:')
print(cols)
pmax = 1
all_cols = cols.copy()
while cols:
    X_l = sm.add_constant(X[cols])          # design matrix with an intercept column
    model = sm.OLS(y, X_l).fit()
    # p-values of the features only: position 0 is skipped (the added constant)
    p = pd.Series(model.pvalues.values[1:], index=cols)
    pmax = max(p)
    feature_with_p_max = p.idxmax()
    if not pmax > 0.05:
        break                               # nothing left above the threshold
    cols.remove(feature_with_p_max)

selected_features = cols
print('Selected columns:')                  # columns kept by the elimination loop
print(selected_features)
print('Eliminated columns:')                # columns that were removed
print([name for name in X.columns if name not in selected_features])
X = X[selected_features]                    # keep only the selected features as model input

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from matplotlib import pyplot as plt

# Hold out part of the data so the models can be evaluated on rows they were not fitted on.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print('Data split!')

# Baseline model: a 200-tree random forest fitted on the first 20,000 training rows only.
classifier = RandomForestClassifier(random_state=0, n_estimators=200)
classifier.fit(X_train[:20000], y_train[:20000])
print('Model made!')

# Two panels side by side: raw counts on the left, row-normalised values on the right.
fig, axs = plt.subplots(ncols=2, figsize=(10,5))
fig.tight_layout(pad=5.0)
plt.rcParams.update({'font.size': 15})
for cm_ax in axs:
    cm_ax.tick_params(axis='both', which='major', labelsize=15)
for cm_ax, norm_mode in zip(axs, (None, 'true')):
    disp = plot_confusion_matrix(classifier, X_test, y_test,
                                 cmap=plt.cm.Blues,
                                 normalize=norm_mode,
                                 ax=cm_ax)

In [None]:
# Undersample the training data so that both classes are equally represented.
train_df = pd.concat([X_train, y_train], axis=1)           # features and target side by side

# Row labels of each class. Both masks are built from train_df itself, so the
# selection does not depend on index alignment with the full dc_df frame.
good_connections = train_df[train_df.Kopma == 0].index
bad_connections = train_df[train_df.Kopma == 1].index
sample_size = len(bad_connections)                         # number of bad connections in the training split

# Draw (without replacement) as many good rows as there are bad rows;
# the request is capped at the number of good rows that actually exist.
random_indices = np.random.choice(good_connections,
                                  min(sample_size, len(good_connections)),
                                  replace=False)
undersampled_df = train_df.loc[random_indices]
bad_connections_sample = train_df.loc[bad_connections]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
undersampled_df = pd.concat([undersampled_df, bad_connections_sample])
print(Counter(undersampled_df['Kopma']))                   # check that both classes have the same count

# New training target / features taken from the undersampled frame.
y_train = undersampled_df.Kopma
X_train = undersampled_df[selected_features]

In [None]:
# Fit a fresh 200-tree random forest on the current (undersampled) training data.
classifier = RandomForestClassifier(random_state=0, n_estimators=200)
classifier.fit(X_train, y_train)
print('Model made!')

# Confusion matrices on the test split: raw counts (left) and row-normalised (right).
fig, axs = plt.subplots(ncols=2, figsize=(10,5))
fig.tight_layout(pad=5.0)
plt.rcParams.update({'font.size': 15})
for cm_ax in axs:
    cm_ax.tick_params(axis='both', which='major', labelsize=15)
for cm_ax, norm_mode in zip(axs, (None, 'true')):
    disp = plot_confusion_matrix(classifier, X_test, y_test,
                                 cmap=plt.cm.Blues,
                                 normalize=norm_mode,
                                 ax=cm_ax)

In [None]:
from sklearn.neighbors import KNeighborsClassifier  # second classification method

# K-nearest-neighbours with 7 neighbours and distance-weighted votes.
# Per the original author's notes: the choice of 7 is explained later in the notebook,
# and weights='distance' performed better than the other options.
classifier = KNeighborsClassifier(weights='distance', n_neighbors=7)
classifier.fit(X_train, y_train)
print('Model done!')

# Confusion matrices on the test split: raw counts (left) and row-normalised (right).
fig, axs = plt.subplots(ncols=2, figsize=(10,5))
fig.tight_layout(pad=5.0)
plt.rcParams.update({'font.size': 15})
for cm_ax in axs:
    cm_ax.tick_params(axis='both', which='major', labelsize=15)
for cm_ax, norm_mode in zip(axs, (None, 'true')):
    disp = plot_confusion_matrix(classifier, X_test, y_test,
                                 cmap=plt.cm.Blues,
                                 normalize=norm_mode,
                                 ax=cm_ax)

In [None]:
# Start again from a fresh split (same random_state, so the same split as before).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
print('Data split!')

# Same undersampling idea as before, except that 600 more good connections than
# bad connections are kept, to reflect that good connections do appear more often.
train_df = pd.concat([X_train, y_train], axis=1)

# Row labels of each class. Both masks are built from train_df itself, so the
# selection does not depend on index alignment with the full dc_df frame.
good_connections = train_df[train_df.Kopma == 0].index
bad_connections = train_df[train_df.Kopma == 1].index
sample_size = len(bad_connections)

# Draw without replacement; cap the request at the number of good rows available
# so the extra 600 can never ask for more rows than exist.
random_indices = np.random.choice(good_connections,
                                  min(sample_size + 600, len(good_connections)),
                                  replace=False)
undersampled_df = train_df.loc[random_indices]
bad_connections_sample = train_df.loc[bad_connections]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
undersampled_df = pd.concat([undersampled_df, bad_connections_sample])
print(Counter(undersampled_df['Kopma']))
y_train = undersampled_df.Kopma
X_train = undersampled_df[selected_features]

In [None]:
# Fit one K-neighbours model per candidate K and compare their scores on the test split.
knc_classifiers = list()
accs = list()   # created once, before the loops (it used to be reset inside the training loop)

for i in range(4,10): # one model for each n_neighbors value 4..9
    classifier = KNeighborsClassifier(n_neighbors= i, weights='distance')
    classifier.fit(X_train, y_train)
    knc_classifiers.append(classifier)

for clf in knc_classifiers:  # score every fitted model
    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred, normalize = "true")
    # Score taken from the row-normalised confusion matrix: the diagonal sum divided by
    # the matrix total. For two classes this is the mean of the per-class recalls.
    acc = (cm[0,0] + cm[1,1]) / cm.sum()
    print("Knn classifier with n = {}, acc = {}".format(clf.n_neighbors, acc))
    accs.append(acc)

fig, ax = plt.subplots(figsize = (8,8))  # score as a function of K

knc_ns = [clf.n_neighbors for clf in knc_classifiers]

ax.plot(knc_ns, accs, linewidth = 3.0)
ax.set_ylim([0,1])
ax.set_title('K-Neighbors Algorithm')
ax.set_xlabel("K")
ax.set_ylabel("Accuracy")
ax.grid()
ax.minorticks_on()
plt.show()  # render the figure (a bare plt.plot() draws nothing and only echoes an empty list)

In [None]:
# Refit the K-neighbours model with the best K from the sweep.
# NOTE(review): accs holds scores measured on X_test, so this K is tuned on the test split.
n = np.argmax(accs)  # index of the highest score in accs (assumes accs still holds the KNN sweep)

# Read K from the fitted sweep model at that index instead of hard-coding the
# range offset (index + 4); this stays correct if the sweep range changes.
best_n_neighbors = knc_classifiers[n].n_neighbors

classifier = KNeighborsClassifier(n_neighbors=best_n_neighbors, weights='distance')
classifier.fit(X_train, y_train) # fitting data
print('Model done!')

# Confusion matrices on the test split: raw counts (left) and row-normalised (right).
fig, axs = plt.subplots(ncols = 2, figsize=(10,5))
fig.tight_layout(pad=5.0)
plt.rcParams.update({'font.size': 15})
axs[0].tick_params(axis='both', which='major', labelsize=15)
axs[1].tick_params(axis='both', which='major', labelsize=15)
disp = plot_confusion_matrix(classifier, X_test, y_test,
                             cmap=plt.cm.Blues,
                             ax=axs[0])
disp = plot_confusion_matrix(classifier, X_test, y_test,
                             cmap=plt.cm.Blues,
                             normalize='true',
                             ax=axs[1])

RANDOM FOREST OPTIMUM DEPTH

In [None]:
# This block does for Random Forest what the K sweep did for K-Neighbors:
# fit one model per candidate max_depth and compare their scores on the test split.
rfc_classifiers = list()
accs = list()   # created once, before the loops (it used to be reset inside the training loop)

for max_depth in range(10,30):  # one 200-tree forest for each max_depth 10..29
    classifier = RandomForestClassifier(max_depth= max_depth, n_estimators = 200, random_state = 0)
    classifier.fit(X_train, y_train)
    rfc_classifiers.append(classifier)

for clf in rfc_classifiers:
    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred, normalize = "true")
    # Score taken from the row-normalised confusion matrix: the diagonal sum divided by
    # the matrix total. For two classes this is the mean of the per-class recalls.
    acc = (cm[0,0] + cm[1,1]) / cm.sum()
    print("RF classifier with n = {}, acc = {}".format(clf.max_depth, acc))  # the printed "n" is max_depth
    accs.append(acc)

fig, ax = plt.subplots(figsize = (8,8))  # score as a function of depth

rfc_ns = [clf.max_depth for clf in rfc_classifiers]

ax.plot(rfc_ns, accs, linewidth = 3.0)
ax.set_ylim([0,1])
ax.set_title('Random Forest Algorithm')
ax.set_xlabel("Depth")
ax.set_ylabel("Accuracy")
ax.grid()
ax.minorticks_on()
plt.show()  # render the figure (a bare plt.plot() draws nothing and only echoes an empty list)

In [None]:
# Refit the random forest with the best max_depth from the sweep.
# NOTE(review): accs holds scores measured on X_test, so this depth is tuned on the test split.
best_depth = np.argmax(accs)  # despite the name, this is the *index* of the highest score in accs

# Read the depth from the fitted sweep model at that index instead of hard-coding
# the range offset (index + 10); this stays correct if the sweep range changes.
best_max_depth = rfc_classifiers[best_depth].max_depth

classifier = RandomForestClassifier(max_depth=best_max_depth, n_estimators = 200, random_state = 0)
classifier.fit(X_train, y_train) # fitting data
print('Model done!')

# Confusion matrices on the test split: raw counts (left) and row-normalised (right).
fig, axs = plt.subplots(ncols = 2, figsize=(10,5))
fig.tight_layout(pad=5.0)
plt.rcParams.update({'font.size': 15})
axs[0].tick_params(axis='both', which='major', labelsize=15)
axs[1].tick_params(axis='both', which='major', labelsize=15)
disp = plot_confusion_matrix(classifier, X_test, y_test,
                             cmap=plt.cm.Blues,
                             ax=axs[0])
disp = plot_confusion_matrix(classifier, X_test, y_test,
                             cmap=plt.cm.Blues,
                             normalize='true',
                             ax=axs[1])

In [None]:
# Check how the model performs without feature selection: reload the data and keep every column.

dc_df = pd.read_csv("../input/connectionlossdata/newdata.csv", header=0, index_col=0,low_memory=False) # reading the saved .csv file
print(Counter(dc_df['Kopma'])) # class distribution of the 'Kopma' column
y = dc_df.Kopma                      # target vector
X = dc_df.drop(columns='Kopma')      # feature frame: a new frame without the target, dc_df stays untouched
cols = list(X.columns)    # list of all feature column names
selected_features = cols  # no elimination here: every column is a selected feature

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
print('Data split!')

# Same undersampling idea as in the earlier cells, except that 2500 more good
# connections than bad connections are kept.
train_df = pd.concat([X_train, y_train], axis=1)

# Row labels of each class. Both masks are built from train_df itself, so the
# selection does not depend on index alignment with the full dc_df frame.
good_connections = train_df[train_df.Kopma == 0].index
bad_connections = train_df[train_df.Kopma == 1].index
sample_size = len(bad_connections)

# Draw without replacement; cap the request at the number of good rows available
# so the extra 2500 can never ask for more rows than exist.
random_indices = np.random.choice(good_connections,
                                  min(sample_size + 2500, len(good_connections)),
                                  replace=False)
undersampled_df = train_df.loc[random_indices]
bad_connections_sample = train_df.loc[bad_connections]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
undersampled_df = pd.concat([undersampled_df, bad_connections_sample])
print(Counter(undersampled_df['Kopma']))
y_train = undersampled_df.Kopma
X_train = undersampled_df[selected_features]

# K-neighbours model with distance-weighted votes; n_neighbors is fixed at 4 in this cell.
classifier = KNeighborsClassifier(weights='distance', n_neighbors=4)
classifier.fit(X_train, y_train)
print('Model done!')

# Confusion matrices on the test split: raw counts (left) and row-normalised (right).
fig, axs = plt.subplots(ncols=2, figsize=(10,5))
fig.tight_layout(pad=5.0)
plt.rcParams.update({'font.size': 15})
for cm_ax in axs:
    cm_ax.tick_params(axis='both', which='major', labelsize=15)
for cm_ax, norm_mode in zip(axs, (None, 'true')):
    disp = plot_confusion_matrix(classifier, X_test, y_test,
                                 cmap=plt.cm.Blues,
                                 normalize=norm_mode,
                                 ax=cm_ax)