In [1]:
import pandas as pd
import sklearn as sk
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
import sys
from sklearn.feature_selection import RFE, RFECV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
sys.setrecursionlimit(10**6) 

In [2]:
csv_file = pd.read_csv("pres2_v2.csv")
print(csv_file.shape)
print(csv_file[csv_file["class"]==0].shape, csv_file[csv_file["class"]==1].shape)


(34141, 59)
(31463, 59) (2678, 59)


### Cleaning all the data 

In [3]:
csv_file.pop('index')
csv_file["city"] = csv_file.city.str.lower()
csv_file["county"] = csv_file.county.str.lower()
csv_file["ipcity"] = csv_file.ipcity.str.lower()
csv_file["ipstate"] = csv_file.ipstate.str.lower()
csv_file["browser"] = csv_file.browser.str.lower()
csv_file["os"] = csv_file.os.str.lower()
csv_file["os_v"] = csv_file.os_v.str.lower()
csv_file["device"] = csv_file.device.str.lower()
csv_file["device_family"] = csv_file.device_family.str.lower()
csv_file["device_model"] = csv_file.device_model.str.lower()


In [4]:
csv_file["city"] = csv_file.city.astype('category').cat.codes
csv_file["county"] = csv_file.county.astype('category').cat.codes
csv_file["ipcity"] = csv_file.ipcity.astype('category').cat.codes
csv_file["ipstate"] = csv_file.ipstate.astype('category').cat.codes
csv_file["browser"] = csv_file.browser.astype('category').cat.codes
csv_file["os"] = csv_file.os.astype('category').cat.codes
csv_file["os_v"] = csv_file.os_v.astype('category').cat.codes
csv_file["device"] = csv_file.device.astype('category').cat.codes
csv_file["device_family"] = csv_file.device_family.astype('category').cat.codes
csv_file["device_model"] = csv_file.device_model.astype('category').cat.codes
csv_file["ippostcode"] = csv_file.ippostcode.astype('category').cat.codes

csv_file.head()

Unnamed: 0,city,county,postcode,ipcity,ipstate,ippostcode,browser,browser_v,os,os_v,...,attr36,attr37,attr38,attr39,attr40,attr41,attr42,attr44,attr45,class
0,1429,8,110034,300,29,388,1,80.0,0,38,...,0.801286,0.0,0.0,28.774,135.89145,25.797,22.642405,0.673,3.30301,0
1,783,29,500001,331,74,1155,13,123.0,0,76,...,2.2305,0.0,0.0,37.751,22.31925,10.393857,12.644706,1.622,3.486603,0
2,783,29,500001,331,74,1155,13,123.0,0,76,...,2.514,0.0,0.0,28.6435,18.9494,9.43675,13.253333,1.986,3.486603,0
3,1127,25,141001,472,65,467,13,136.0,0,76,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.906,3.607452,0
4,952,25,144411,353,65,498,16,12.1,2,23,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.095,3.609457,0


### Check for the class imbalance and take equal entries from both

In [5]:
majority = csv_file[csv_file["class"]==0]
minority = csv_file[csv_file["class"]==1]
print("Majority shape : ", majority.shape)
print("Minority shape : ", minority.shape)

new_df = pd.concat([majority.sample(minority.shape[0],), minority])
new_df.shape

Majority shape :  (31463, 58)
Minority shape :  (2678, 58)


(5356, 58)

In [6]:
# Split your train and test data in 7:3 ration
train_x,test_x= sk.model_selection.train_test_split(new_df, test_size=0.30)
train_x.shape

train_y = train_x.pop("class") 
test_y = test_x.pop("class") 

### Train a gradient boosting classifier

In [7]:
clf = GradientBoostingClassifier(random_state=0,n_estimators=100)
selector = RFE(clf)
selector = selector.fit(train_x, train_y)
print(selector.support_)

pred = selector.predict(test_x)

[ True  True  True  True False  True False  True False  True False False
  True False False False  True False False  True False False False  True
 False  True False  True False False False  True  True  True  True False
 False  True  True False  True False False False  True  True False False
 False False False  True  True  True  True  True  True]


In [8]:
print("Macro f1 : ",f1_score(test_y, pred, average="macro"))
print("Macro Precission : ",precision_score(test_y, pred, average="macro"))
print("Macro Recall : ",recall_score(test_y, pred, average="macro"))  
print("Macro f1 : ",f1_score(test_y, pred, average="micro"))
print("Macro Precission : ",precision_score(test_y, pred, average="micro"))
print("Macro Recall : ",recall_score(test_y, pred, average="micro"))  
print("Binary f1 : ",f1_score(test_y, pred, average="binary"))
print("Binary Precission : ",precision_score(test_y, pred, average="binary"))
print("Binary Recall : ",recall_score(test_y, pred, average="binary"))  
print(confusion_matrix(test_y,pred))

Macro f1 :  0.7919100431515695
Macro Precission :  0.8004347853613293
Macro Recall :  0.7940930852300017
Macro f1 :  0.7927815805849409
Macro Precission :  0.7927815805849409
Macro Recall :  0.7927815805849409
Binary f1 :  0.7784431137724551
Binary Precission :  0.8527696793002916
Binary Recall :  0.7160342717258262
[[689 101]
 [232 585]]


In [9]:
output_df = pd.DataFrame()

In [10]:
output_df["actual_class"] = test_y
output_df["predicted_class"] = pred
output_df.head()

Unnamed: 0,actual_class,predicted_class
24445,0,1
32175,1,0
32111,1,0
23090,0,0
11606,0,0
