# HW06

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.svm import NuSVC
from sklearn.svm import OneClassSVM
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm

## Predict Country with Client IP

### 1. Data Preprocessing

At this stage, we are loading the data into a dataframe and then dropping the columns that are not needed for the model.

In [2]:
# load request.csv file into a dataframe
df = pd.read_csv('request.csv', on_bad_lines='skip')

# drop columns 0,1,2,3,6,7,8
df.drop(df.columns[[0,1,2,3,6,7,8]], axis=1, inplace=True)

# add headers to dataframe; first column is 'country', second column is 'ip'
df.columns = ['country', 'ip']

# encode ip as integer
le = LabelEncoder()
df['ip'] = le.fit_transform(df['ip'])

# print first 5 rows
print(df.head())

                  country     ip
0  Bosnia and Herzegovina   3156
1                  Gambia  42180
2                  Gambia  42180
3            Sierra Leone  29692
4            Sierra Leone  29692


### 2. Model Selection

We split into train and test sets. Our goal is to predict the country give an ip.

In [3]:
# split into train and test sets
train, test = train_test_split(df, test_size=0.2)

# split train and test sets into X and y
X_train = train.drop('country', axis=1)
y_train = train['country']

X_test = test.drop('country', axis=1)
y_test = test['country']

# encode labels
encoder = LabelEncoder()

y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

# scale features
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# try different models which are good for classification
models = {
    'logistic_regression': LogisticRegression(),
    'random_forest': RandomForestClassifier(),
    # 'gradient_boosting': GradientBoostingClassifier(),
    # 'decision_tree': DecisionTreeClassifier(),
    # 'knn': KNeighborsClassifier(),
    # 'gaussian_nb': GaussianNB(),
    # 'linear_svc': LinearSVC(),
    # 'nu_svc': NuSVC(),
    # 'one_class_svm': OneClassSVM(),
    # 'svc': SVC()
}

# train and test models
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + ' trained.')
    y_pred = model.predict(X_test)
    print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))

random_forest trained.
Accuracy: 0.6681310912488007
linear_svc trained.
Accuracy: 0.017926576781295764
nu_svc trained.
Accuracy: 0.19153663586325304
one_class_svm trained.
Accuracy: 0.002575367368580518
svc trained.
Accuracy: 0.1285663788314902


### 3. Model Optimization

Random Forest is giving the best result, with an accuracy of 66.8%. We will try to optimize it by tuning the hyperparameters.

In [6]:
# use random search to find best parameters for random forest

# number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]

# number of features to consider at every split
max_features = ['sqrt', 'log2']

# maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]

# minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# method of selecting samples for training each tree
bootstrap = [True, False]

# create random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

# use random grid to search for best hyperparameters
# first create base model to tune
rf = RandomForestClassifier()

# random search of parameters, using 3 fold cross validation
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

# fit the random search model
rf_random.fit(X_train, y_train)

# print best parameters
print(rf_random.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


  warn(
  warn(
  warn(
  warn(
