# HW06

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm

## Predict Country with Client IP

### 1. Data Preprocessing

At this stage, we are loading the data into a dataframe and then dropping the columns that are not needed for the model. Looking at the code for how IPs are generated, we can see that the first 3 octets are deterministic while the last octet is completely random. Therefore, we can use the first 3 octets to predict the country.

In [2]:
# load request.csv file into a dataframe
df = pd.read_csv('request.csv', on_bad_lines='skip')

# drop columns 0,1,2,3,6,7,8
df.drop(df.columns[[0,1,2,3,6,7,8]], axis=1, inplace=True)

# add headers to dataframe; first column is 'country', second column is 'ip'
df.columns = ['country', 'ip']

# split ip into 4 columns
df[['ip1','ip2','ip3','ip4']] = df.ip.str.split(".",expand=True,)

# drop ip column
df.drop(['ip'], axis=1, inplace=True)

# drop ip4 column
df.drop(['ip4'], axis=1, inplace=True)

# encode labels
encoder = LabelEncoder()

# encode country column
df['country'] = encoder.fit_transform(df['country'])

# print first 5 rows
print(df.head())

   country  ip1  ip2  ip3
0       21  113   98   85
1       60   95  222   97
2       60   95  222   97
3      153  244  222  137
4      153  244  222  137


### 2. Model Selection

We split into train and test sets. Our goal is to predict the country give an ip. We will try a few models and see which one performs better.

In [3]:
# split into train and test sets
train, test = train_test_split(df, test_size=0.2)

# split train and test sets into X and y
X_train = train.drop('country', axis=1)
y_train = train['country']

X_test = test.drop('country', axis=1)
y_test = test['country']

In [4]:
# try different models which are good for classification
models = {
    'logistic_regression': LogisticRegression(),
    'random_forest': RandomForestClassifier(),
    'decision_tree': DecisionTreeClassifier(),
    'knn': KNeighborsClassifier(),
    'gaussian_nb': GaussianNB(),
}

# train and test models
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + ' trained.')
    y_pred = model.predict(X_test)
    print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))
    print()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


logistic_regression trained.
Accuracy: 0.08902691511387163

random_forest trained.
Accuracy: 1.0

decision_tree trained.
Accuracy: 1.0

knn trained.
Accuracy: 1.0

gaussian_nb trained.
Accuracy: 0.20956420744331666



### 3. Model Evaluation

Random Forest, Decision Tree, and KNN are the most accurate models, with an accuracy of 1. So, let's properly build a Random Forest model and see how it performs.

In [5]:
rf = RandomForestClassifier()

# train model
rf.fit(X_train, y_train)

# predict
y_pred = rf.predict(X_test)

# print accuracy, confusion matrix, and classification report
print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))
print()

print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print()

print('Classification Report:')
print(classification_report(y_test, y_pred))

Accuracy: 1.0

Confusion Matrix:
[[ 96   0   0 ...   0   0   0]
 [  0 109   0 ...   0   0   0]
 [  0   0 100 ...   0   0   0]
 ...
 [  0   0   0 ...  93   0   0]
 [  0   0   0 ...   0 105   0]
 [  0   0   0 ...   0   0 101]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        96
           1       1.00      1.00      1.00       109
           2       1.00      1.00      1.00       100
           3       1.00      1.00      1.00        88
           4       1.00      1.00      1.00       106
           5       1.00      1.00      1.00        96
           6       1.00      1.00      1.00       105
           7       1.00      1.00      1.00        94
           8       1.00      1.00      1.00        95
           9       1.00      1.00      1.00       102
          10       1.00      1.00      1.00       101
          11       1.00      1.00      1.00       104
          12       1.00      1.00      1.00     

## Predict Income

### 1. Data Preprocessing

Looking at the code, we can see that the gender, age, income, time, and ip are all generated randomly. Let's see if we can predict the income given the other features.

In [5]:
def process_income_data():
  # load request.csv file into a dataframe
  df = pd.read_csv('request.csv', on_bad_lines='skip')

  # remove column 1,3,5 
  df.drop(df.columns[[0,1,3,5]], axis=1, inplace=True)

  # add headers to dataframe: id, time, file, country, gender, age, income
  df.columns = ['file', 'country', 'gender', 'age', 'income']

  # turn file column from 'html/1.html' to '1'
  df['file'] = df['file'].str.replace('html/', '').str.replace('.html', '')

  # turn into boolean values
  df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})

  list_of_ages = ['0-16', '17-25', '26-35', '36-45', '46-55', '56-65', '66-75', '76+']
  list_of_incomes = ['0-10k', '10k-20k', '20k-40k', '40k-60k', '60k-100k', '100k-150k', '150k-250k', '250k+']

  # turn age into integer values
  df['age'] = df['age'].map({'0-16': 0, '17-25': 1, '26-35': 2, '36-45': 3, '46-55': 4, '56-65': 5, '66-75': 6, '76+': 7})

  # turn income into integer values
  df['income'] = df['income'].map({'0-10k': 0, '10k-20k': 1, '20k-40k': 2, '40k-60k': 3, '60k-100k': 4, '100k-150k': 5, '150k-250k': 6, '250k+': 7})

  # encode country labels
  encoder = LabelEncoder()

  # encode country column
  df['country'] = encoder.fit_transform(df['country'])

  # print first 5 rows
  print(df.head())

  return df

df = process_income_data()

   file  country  gender  age  income
0  3475       21       0    6       7
1  4678       60       1    5       7
2  4678       60       1    5       7
3  2116      153       1    5       3
4  2116      153       1    5       3


### 2. Model Selection

Let's try a few different classification models and see which one performs better. We're going to make a test/train split and then fit the models on the training data.

In [7]:
def generate_train_test_split(df):
    # create test train split
    train, test = train_test_split(df, test_size=0.2)

    # split train and test sets into X and y; we are attempting to guess income
    X_train = train.drop('income', axis=1)
    y_train = train['income']

    X_test = test.drop('income', axis=1)
    y_test = test['income']

    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = generate_train_test_split(df)

In [23]:
models = {
    'logistic_regression': LogisticRegression(),
    'random_forest': RandomForestClassifier(),
    'decision_tree': DecisionTreeClassifier(),
    'knn': KNeighborsClassifier(),
    'gaussian_nb': GaussianNB(),
}

# train and test models
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + ' trained.')
    y_pred = model.predict(X_test)
    print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))
    print()

logistic_regression trained.
Accuracy: 0.12659698025551683

random_forest trained.
Accuracy: 0.8238650709488461

decision_tree trained.
Accuracy: 0.8210372165833459

knn trained.
Accuracy: 0.20951371004393274

gaussian_nb trained.
Accuracy: 0.12710195424935616



## 3. Model Evaluation

The most accurate model was Random Forest, with an accuracy of ~82%. This may be due to its ability to predict psuedo-random data. Let's build a Random Forest model and see how it performs.

In [24]:
rf = RandomForestClassifier()

# train model
rf.fit(X_train, y_train)

# predict
y_pred = rf.predict(X_test)

# print accuracy, confusion matrix, and classification report
print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))
print()

print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print()

print('Classification Report:')
print(classification_report(y_test, y_pred))

Accuracy: 0.8249760137352926

Confusion Matrix:
[[2054   76   70   68   68   62   56   49]
 [  56 2063   66   77   57   64   72   61]
 [  62   38 2055   56   67   80   44   76]
 [  70   54   56 2035   56   40   56   48]
 [  66   90   60   66 2018   64   57   66]
 [  55   62   61   52   68 2071   57   54]
 [  60   64   62   52   51   58 2039   50]
 [  66   82   67   64   66   76   65 2002]]

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.82      0.82      2503
           1       0.82      0.82      0.82      2516
           2       0.82      0.83      0.83      2478
           3       0.82      0.84      0.83      2415
           4       0.82      0.81      0.82      2487
           5       0.82      0.84      0.83      2480
           6       0.83      0.84      0.84      2436
           7       0.83      0.80      0.82      2488

    accuracy                           0.82     19803
   macro avg       0.83      0.83      0.

### Predict Income (Remove Duplicates)

What happens if we remove all duplicate data? Then our accuracy will be much worse. Let's see how much worse.

In [11]:
df = process_income_data()

# remove all duplicates from dataframe
df.drop_duplicates(inplace=True)

# print the number of rows and columns
print(df.shape)

X_train, y_train, X_test, y_test = generate_train_test_split(df)

# train random forest model
rf = RandomForestClassifier()

# train model
rf.fit(X_train, y_train)

# predict
y_pred = rf.predict(X_test)

# print accuracy, confusion matrix, and classification report
print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))
print()

print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

print('Classification Report:')
print(classification_report(y_test, y_pred))

   file  country  gender  age  income
0  3475       21       0    6       7
1  4678       60       1    5       7
2  4678       60       1    5       7
3  2116      153       1    5       3
4  2116      153       1    5       3
(49500, 5)
Accuracy: 0.12323232323232323

Confusion Matrix:
[[159 138 174 187 151 157 147 164]
 [153 159 161 152 163 142 142 137]
 [172 150 143 147 172 161 153 165]
 [158 153 164 133 145 144 152 135]
 [165 148 148 162 155 137 151 153]
 [154 153 165 155 153 179 144 158]
 [136 177 169 152 157 156 159 136]
 [166 166 183 174 130 149 144 133]]
Classification Report:
              precision    recall  f1-score   support

           0       0.13      0.12      0.13      1277
           1       0.13      0.13      0.13      1209
           2       0.11      0.11      0.11      1263
           3       0.11      0.11      0.11      1184
           4       0.13      0.13      0.13      1219
           5       0.15      0.14      0.14      1261
           6       0.13      