In [2]:
import pandas as pd
# Load the raw training set; the relative path is resolved against the working directory.
data = pd.read_csv(filepath_or_buffer="Train.csv")

In [3]:
# Preview the first rows to sanity-check the load and see the available columns.
data.head()

Unnamed: 0,id_code,current_date,current_time,source_name,destination_name,train_name,target,country_code_source,longitude_source,latitude_source,mean_halt_times_source,country_code_destination,longitude_destination,latitude_destination,mean_halt_times_destination,current_year,current_week,current_day,is_weekend
0,isfywypmkqqhyft,2016-07-27,08:05:51 PM,station$147,station$1,ICZVZS,high,whber,4.356801,50.845658,634.16474,,,,,2016,30,Wednesday,False
1,mqsfxyvuqpbwomk,2016-07-27,08:06:11 PM,station$147,station$1,ICZVZS,high,whber,4.356801,50.845658,634.16474,,,,,2016,30,Wednesday,False
2,alspwwtbdvqsgby,2016-07-27,08:08:57 PM,station$147,station$1,ICZVZS,high,whber,4.356801,50.845658,634.16474,,,,,2016,30,Wednesday,False
3,szitxhhqduyrqpg,2016-07-27,08:09:08 PM,station$147,station$1,ICZVZS,high,whber,4.356801,50.845658,634.16474,,,,,2016,30,Wednesday,False
4,krisdqzczivvwcp,2016-07-27,08:11:01 PM,station$147,station$1,ICZVZS,high,whber,4.356801,50.845658,634.16474,,,,,2016,30,Wednesday,False


## Dropping all rows with NA

In [4]:
# Discard every row that has at least one missing value (mutates `data` in place).
data.dropna(axis=0, how="any", inplace=True)

In [5]:
# Report the (rows, columns) that remain after the NA filter.
n_rows, n_cols = data.shape
print((n_rows, n_cols))

(1250, 19)


## Dropping columns with many distinct levels

In [6]:
# Identifier-like columns with many distinct levels are removed in place before modelling.
high_cardinality_columns = [
    'id_code',
    'source_name',
    'destination_name',
    'train_name',
    'country_code_source',
    'country_code_destination',
]
data.drop(columns=high_cardinality_columns, inplace=True)

In [7]:
# Inspect the dtype of each remaining column.
data.dtypes

current_date                    object
current_time                    object
target                          object
longitude_source               float64
latitude_source                float64
mean_halt_times_source         float64
longitude_destination          float64
latitude_destination           float64
mean_halt_times_destination    float64
current_year                     int64
current_week                     int64
current_day                     object
is_weekend                        bool
dtype: object

In [8]:
# Derive the hour of day (0-23) from the 12-hour clock strings, e.g. "08:05:51 PM".
# An explicit format keeps parsing deterministic across pandas versions and makes a
# malformed value raise instead of being guessed element by element.
data['current_hour'] = pd.to_datetime(data['current_time'], format="%I:%M:%S %p").dt.hour

## Dropping current_date, current_year, current_time, current_week

In [9]:
# Raw date/time fields are removed in place; `current_hour` and `current_day` are kept.
calendar_columns = ['current_date', 'current_year', 'current_time', 'current_week']
data.drop(columns=calendar_columns, inplace=True)

In [10]:
# Confirm which columns remain after the drops.
data.columns

Index(['target', 'longitude_source', 'latitude_source',
       'mean_halt_times_source', 'longitude_destination',
       'latitude_destination', 'mean_halt_times_destination', 'current_day',
       'is_weekend', 'current_hour'],
      dtype='object')

## Clustering - Spatial analytics

In [11]:
# List the columns available before selecting the coordinate features
# (same listing as the previous cell).
data.columns

Index(['target', 'longitude_source', 'latitude_source',
       'mean_halt_times_source', 'longitude_destination',
       'latitude_destination', 'mean_halt_times_destination', 'current_day',
       'is_weekend', 'current_hour'],
      dtype='object')

In [12]:
# Select the four coordinate columns (destination first, then source) used for clustering.
coordinate_columns = [
    'longitude_destination',
    'latitude_destination',
    'longitude_source',
    'latitude_source',
]
spacial_data = data.loc[:, coordinate_columns]

## Normalizing spatial data

In [13]:
from sklearn.preprocessing import MinMaxScaler
# Rescale each coordinate column to the [0, 1] range so no axis dominates the
# Euclidean distances used by k-means.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split
# further down, so test-set ranges influence this feature — confirm that is acceptable.
scaler = MinMaxScaler()
spacial_data = pd.DataFrame(
    scaler.fit_transform(spacial_data),
    columns=spacial_data.columns,
    # Keep the original row labels: after the NA filter the index is no longer
    # 0..n-1, and a fresh RangeIndex would silently misalign with `data`.
    index=spacial_data.index,
)

In [14]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Elbow search: fit k-means for k = 2..14 and record the within-cluster sum of
# squares, then plot it against k to pick the number of clusters visually.
wss = {}
for k in range(2, 15):
    # Fixed seed (same value the final clustering model uses) so the curve is
    # identical on every Restart & Run All.
    kmeans = KMeans(n_clusters=k, random_state=1240)
    kmeans.fit(spacial_data)
    # inertia_: sum of *squared* distances of samples to their closest cluster center
    wss[k] = kmeans.inertia_
plt.figure()
plt.plot(list(wss.keys()), list(wss.values()))
plt.xlabel("Number of cluster")
plt.ylabel("wss")
plt.show()

<Figure size 640x480 with 1 Axes>

In [15]:
from sklearn.cluster import KMeans
# Final spatial segmentation: a seeded 4-cluster k-means on the scaled coordinates.
# The predicted cluster id of every row is stored positionally as a new feature.
# NOTE(review): the cluster id is a nominal label but is stored as an integer, so
# later numeric scaling treats it as ordered — consider one-hot encoding it.
kmeans_object = KMeans(n_clusters=4, random_state=1240)
data['cluster'] = kmeans_object.fit(spacial_data).predict(spacial_data)

## Decoupling target

In [16]:
# Separate predictors from the label. `Index.difference` returns the remaining
# column names in sorted order, so the columns of X are alphabetical.
feature_columns = data.columns.difference(['target'])
X = data.loc[:, feature_columns]
y = data.loc[:, 'target']

## Train - Test split

In [17]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; stratifying on y keeps the class
# proportions the same in both splits, and the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=99,
)

## Dummification

In [18]:
# One-hot encode the categorical columns. The training split defines the feature
# schema; the test split is then aligned to it, because encoding each split
# independently yields different columns whenever a category is absent from one
# of them (missing dummies are filled with 0, unseen test-only ones are dropped).
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

In [19]:
# Show the encoded shapes, one per line: train first, then test.
print(X_train.shape, X_test.shape, sep="\n")

(1000, 16)
(250, 16)


## Data normalization

In [20]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale all features. The scaler is fitted on the training split only and
# then applied to both splits, so no test-set statistics leak into the transform.
full_scaler = MinMaxScaler()
full_scaler.fit(X_train)
# Preserve each split's row index so the scaled frames stay label-aligned with
# y_train / y_test (a fresh RangeIndex would break any index-based join or lookup).
X_train = pd.DataFrame(
    full_scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    full_scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

## Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Baseline classifier: logistic regression with class weights balanced against
# the class frequencies in y_train.
logistic = LogisticRegression(class_weight='balanced')
logistic.fit(X_train, y_train)

logistic_train_preds = logistic.predict(X_train)
logistic_test_preds = logistic.predict(X_test)

# Print both confusion matrices first, then both classification reports;
# within each metric the training split comes before the test split.
evaluation_pairs = (
    (y_train, logistic_train_preds),
    (y_test, logistic_test_preds),
)
for metric in (confusion_matrix, classification_report):
    for actual, predicted in evaluation_pairs:
        print(metric(actual, predicted))



[[133 108  60]
 [ 89 283  53]
 [ 87 123  64]]
[[29 36 10]
 [22 70 14]
 [18 34 17]]
              precision    recall  f1-score   support

        high       0.43      0.44      0.44       301
         low       0.55      0.67      0.60       425
      medium       0.36      0.23      0.28       274

    accuracy                           0.48      1000
   macro avg       0.45      0.45      0.44      1000
weighted avg       0.46      0.48      0.47      1000

              precision    recall  f1-score   support

        high       0.42      0.39      0.40        75
         low       0.50      0.66      0.57       106
      medium       0.41      0.25      0.31        69

    accuracy                           0.46       250
   macro avg       0.44      0.43      0.43       250
weighted avg       0.45      0.46      0.45       250



In [22]:
from sklearn.svm import SVC
# Second model: RBF-kernel SVM with balanced class weights, evaluated the same
# way as the logistic regression (confusion matrices, then classification reports).
svm1 = SVC(class_weight='balanced', kernel='rbf', C=0.2, gamma=1)
svm1.fit(X_train, y_train)

# Dedicated names for the SVM predictions: reusing the `logistic_*_preds`
# variables would overwrite the logistic regression results and mislabel these.
svm_train_preds = svm1.predict(X_train)
svm_test_preds = svm1.predict(X_test)

print(confusion_matrix(y_train, svm_train_preds))
print(confusion_matrix(y_test, svm_test_preds))

print(classification_report(y_train, svm_train_preds))
print(classification_report(y_test, svm_test_preds))

[[149  83  69]
 [ 96 255  74]
 [ 79  85 110]]
[[30 23 22]
 [34 53 19]
 [21 28 20]]
              precision    recall  f1-score   support

        high       0.46      0.50      0.48       301
         low       0.60      0.60      0.60       425
      medium       0.43      0.40      0.42       274

    accuracy                           0.51      1000
   macro avg       0.50      0.50      0.50      1000
weighted avg       0.51      0.51      0.51      1000

              precision    recall  f1-score   support

        high       0.35      0.40      0.38        75
         low       0.51      0.50      0.50       106
      medium       0.33      0.29      0.31        69

    accuracy                           0.41       250
   macro avg       0.40      0.40      0.40       250
weighted avg       0.41      0.41      0.41       250

