In [1]:
## This notebook predicts which user profiles with listings on a housing website belong to scammers. The dataset is skewed,
# which means there are far more entries with y=0 (not a scammer) than with y=1 (is a scammer). Therefore I use the F1 score
# as the metric to benchmark model performance. I first fit a simple logistic regression model and then a more involved
# neural network model.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

In [2]:
# Loading dataset
# Tokens that mark a missing value in the CSV (applied on top of pandas' defaults).
missing_values = ['na','--','?','-','None','none','non','null','NaN','']
# 'na' is also the ISO 3166-1 alpha-2 code for Namibia, so in the country-code
# columns it must be kept as a real value instead of being turned into NaN.
country_code_columns = ['LOGIN_COUNTRY_CODE', 'LISTING_COUNTRY_CODE']
# Read only the header first so a per-column missing-token mapping can be built.
csv_columns = pd.read_csv('HA_Data_Science_Train.csv', nrows=0).columns
na_values_by_column = {
    col: [tok for tok in missing_values if not (tok == 'na' and col in country_code_columns)]
    for col in csv_columns
}
X_train_temp = pd.read_csv('HA_Data_Science_Train.csv', na_values=na_values_by_column)

In [3]:
# (rows, columns) of the raw frame read from the CSV
X_train_temp.shape


(16762, 15)

In [5]:
# Preview the first ten raw rows
X_train_temp.head(10)


Unnamed: 0,LISTING_KIND,LISTING_CITY,LISTING_PRICE,IS_ARCHIVED,ARCHIVE_REASON,LOGIN_COUNTRY_CODE,LISTING_COUNTRY_CODE,LISTING_REGISTRATION_POSSIBLE,ADVERTISER_COMPLETENESS_SCORE,MANAGED_ACCOUNT,HAS_PROFILE_PIC,BROWSER,OS,IS_SCAMMER,ANONYMISED_EMAIL
0,1,Göteborg,725.0,False,,se,se,0,80,False,1,Mobile Safari,iOS,0,bosqlqg@aol.com
1,1,Göteborg,469.35,False,,se,se,1,70,False,1,Chrome,Android,0,fylm235670@gmail.com
2,1,Elche,200.0,False,,es,es,0,25,False,0,Chrome,Android,0,mhzasjonfobc@gmail.com
3,0,Verona,1500.0,False,,it,it,0,86,False,1,Chrome,Mac OS X,0,uclypkijb@italianflat.com
4,1,Copenhagen,904.16,False,,dk,dk,0,5,False,1,Microsoft Edge,Windows,0,dyrrtatzx.fhcs@outlook.com
5,0,Nice,1900.0,False,,it,fr,0,50,False,1,Chrome,Windows,0,tahfbqk.ovgb@gmail.com
6,1,Elche,175.0,False,,es,es,0,40,False,0,Chrome,Windows,0,ipeoq@jjvela.net
7,1,Elche,160.0,False,,ar,es,1,45,False,1,Safari,Mac OS X,0,qroaqfdgld@gmail.com
8,0,Verona,1500.0,False,,it,it,0,86,False,1,Chrome,Android,0,pumhmmkgy@italianflat.com
9,0,Copenhagen,205.75,True,,dk,dk,1,5,False,0,Chrome,Android,0,nbgmjktjkt@hotmail.com


In [10]:
# dtype pandas inferred for the IS_ARCHIVED column
X_train_temp['IS_ARCHIVED'].dtype

dtype('bool')

In [None]:
# Boolean mask (as a NumPy array) that is True for rows labelled IS_SCAMMER == 1
count_y_ones = X_train_temp['IS_SCAMMER'].eq(1).to_numpy()

In [None]:
# Number of rows with IS_SCAMMER == 1 (sum of the boolean mask)
print(count_y_ones.sum())

In [None]:
# Number of missing (NaN) entries in BROWSER, OS, LOGIN_COUNTRY_CODE and
# LISTING_COUNTRY_CODE, printed one per line in that order
print(
    *X_train_temp[['BROWSER', 'OS', 'LOGIN_COUNTRY_CODE', 'LISTING_COUNTRY_CODE']].isna().sum(),
    sep='\n',
)

In [None]:
# Work on an independent copy so the raw frame X_train_temp stays untouched
X_train = X_train_temp.copy(deep=True)

In [None]:
# Replace missing BROWSER / OS entries with an explicit placeholder category.
# The filled Series is assigned back to the frame: calling fillna(inplace=True)
# on an attribute-accessed column is a chained in-place update, which does not
# modify X_train when pandas Copy-on-Write is active.
# NOTE(review): 'uknown' looks like a typo for 'unknown'; the literal is kept as-is
# because the derived one-hot column name depends on it.
X_train['BROWSER'] = X_train['BROWSER'].fillna('uknown')
X_train['OS'] = X_train['OS'].fillna('unknown')

In [None]:
# Cast the four string-valued columns to pandas' categorical dtype in one call
X_train = X_train.astype(
    dict.fromkeys(
        ['BROWSER', 'OS', 'LOGIN_COUNTRY_CODE', 'LISTING_COUNTRY_CODE'],
        'category',
    )
)


In [None]:
# Plain Python lists of the distinct categories found in each categorical column.
# (The name `os` is the one later cells print; it would shadow the standard-library
# module of the same name if that module were ever imported here.)
browsers = X_train['BROWSER'].cat.categories.tolist()
os = X_train['OS'].cat.categories.tolist()
login_countries = X_train['LOGIN_COUNTRY_CODE'].cat.categories.tolist()
listing_countries = X_train['LISTING_COUNTRY_CODE'].cat.categories.tolist()

In [None]:
# Distinct BROWSER categories
print(browsers)

In [None]:
# Distinct OS categories
print(os)

In [None]:
# Distinct LOGIN_COUNTRY_CODE categories
print(login_countries)

In [None]:
# Distinct LISTING_COUNTRY_CODE categories
print(listing_countries)

In [None]:
# Distinct BROWSER categories (same list as printed a few cells earlier)
print(browsers)

In [None]:
# Preview the first rows with the notebook's rich table display instead of
# printing the whole frame as plain text.
X_train.head()


In [None]:
# Row subsets by label: scammers (IS_SCAMMER == 1) and non-scammers (IS_SCAMMER == 0)
X_is_scammer = X_train.loc[X_train['IS_SCAMMER'].eq(1)]
X_not_scammer = X_train.loc[X_train['IS_SCAMMER'].eq(0)]

In [None]:
# Bar chart of OS frequencies among scammer rows
X_is_scammer['OS'].value_counts().plot.bar()

In [None]:
# Bar chart of BROWSER frequencies among scammer rows
X_is_scammer['BROWSER'].value_counts().plot.bar()

In [None]:
# Row counts per IS_SCAMMER value, split by operating system
a4_dims = (6, 6)
fig, ax = plt.subplots(figsize=a4_dims)
sns.countplot(data=X_train, x='IS_SCAMMER', hue='OS', ax=ax)

In [None]:
# Row counts per IS_SCAMMER value, split by browser (reuses a4_dims from the previous cell)
fig, ax = plt.subplots(figsize=a4_dims)
sns_plot = sns.countplot(data=X_train, x='IS_SCAMMER', hue='BROWSER', ax=ax)

In [None]:
# Frequency tables of BROWSER and OS restricted to scammer rows
browser_counts_is_scammer, os_counts_is_scammer = (
    X_is_scammer[column].value_counts() for column in ('BROWSER', 'OS')
)

In [None]:
# Browser frequencies among scammer rows
print(browser_counts_is_scammer)

In [None]:
# Login-country frequency tables, computed separately for scammers and non-scammers
login_country_counts_is_scammer, login_country_counts_not_scammer = (
    subset['LOGIN_COUNTRY_CODE'].value_counts()
    for subset in (X_is_scammer, X_not_scammer)
)

In [None]:
# First 20 entries of the non-scammer login-country frequency table
print(login_country_counts_not_scammer.head(20))

In [None]:
# Country codes (index labels) of the non-scammer login-country frequency table
login_country_counts_not_scammer.index


In [None]:
# Login country codes in the order of the scammer frequency table
login_countries_list = login_country_counts_is_scammer.index.tolist()

In [None]:
## Shortlist: keep only the first 16 login countries from the list above
login_countries_list_small = login_countries_list[:16]


In [None]:
# The 16 shortlisted login country codes
print(login_countries_list_small)

In [None]:
# Keep a login country code only when it is on the shortlist; every other value
# (including missing ones) becomes the catch-all label 'other'
X_train['LOGIN_COUNTRY_CODE'] = (
    X_train['LOGIN_COUNTRY_CODE']
    .astype(object)
    .where(X_train['LOGIN_COUNTRY_CODE'].isin(login_countries_list_small), 'other')
)

In [None]:
# Collapse listing country codes the same way: codes on the shortlist are kept,
# everything else (including missing values) becomes 'other'.
# NOTE(review): the shortlist used here is the one built from LOGIN countries —
# confirm that filtering listing countries with it is intended.
X_train['LISTING_COUNTRY_CODE'] = (
    X_train['LISTING_COUNTRY_CODE']
    .astype(object)
    .where(X_train['LISTING_COUNTRY_CODE'].isin(login_countries_list_small), 'other')
)

In [None]:
# Row counts per IS_SCAMMER value, split by (collapsed) login country
a4_dims = (14, 10)
fig, ax = plt.subplots(figsize=a4_dims)
sns_plot = sns.countplot(data=X_train, x='IS_SCAMMER', hue='LOGIN_COUNTRY_CODE', ax=ax)

In [None]:
# Row counts per IS_SCAMMER value, split by (collapsed) listing country
a4_dims = (14, 10)
fig, ax = plt.subplots(figsize=a4_dims)
sns_plot = sns.countplot(data=X_train, x='IS_SCAMMER', hue='LISTING_COUNTRY_CODE', ax=ax)

In [None]:
# Additional boolean column: True when the login and listing countries are the same.
# The comparison uses the raw codes in X_train_temp rather than the collapsed columns
# of X_train: after collapsing, two *different* rare countries are both labelled
# 'other' and would wrongly compare as equal. Rows where either code is missing
# compare as False.
X_train['login_equals_listing'] = X_train_temp['LOGIN_COUNTRY_CODE'].eq(
    X_train_temp['LISTING_COUNTRY_CODE']
)

In [None]:
# In the plot below, the probability of being a scammer is higher when
# login_equals_listing is False than when it is True
a4_dims = (7, 5)
fig, ax = plt.subplots(figsize=a4_dims)
sns_plot = sns.countplot(data=X_train, x='IS_SCAMMER', hue='login_equals_listing', ax=ax)

In [None]:
# Listing price against the scammer label; the x-axis is clipped to [0, 200000]
a4_dims = (7, 5)
fig, ax = plt.subplots(figsize=a4_dims)
sns.scatterplot(data=X_train, x='LISTING_PRICE', y='IS_SCAMMER', ax=ax)
ax.set_xlim(left=0, right=200000)

In [None]:
# Advertiser completeness score against the scammer label, drawn on the axes
# created here (reuses a4_dims from the previous cell)
fig, ax = plt.subplots(figsize=a4_dims)
sns.scatterplot(data=X_train, x='ADVERTISER_COMPLETENESS_SCORE', y='IS_SCAMMER', ax=ax)

In [None]:
# Display the working frame before the numeric encoding below
X_train

In [None]:
# Converting the boolean flags into 0/1 numeric columns.
# login_equals_listing is a pure True/False comparison result, so it is cast directly:
# category codes depend on which values actually occur (a column holding only True
# would have True encoded as 0).
X_train['login_equals_listing'] = X_train['login_equals_listing'].astype(int)
X_train['MANAGED_ACCOUNT'] = X_train['MANAGED_ACCOUNT'].astype('category').cat.codes
# Converting the % completeness score into a fraction. The value is read from the
# untouched raw frame so that re-running this cell does not divide by 100 again.
X_train['ADVERTISER_COMPLETENESS_SCORE'] = X_train_temp['ADVERTISER_COMPLETENESS_SCORE'] / 100


In [None]:
# Display the frame after encoding the flags and rescaling the completeness score
X_train

In [None]:
# Mean and standard deviation of the raw listing price. They are taken from the
# untouched X_train_temp so that re-running this cell after LISTING_PRICE has been
# standardised in X_train still yields the statistics of the original prices.
# NOTE(review): these statistics cover all rows, including those later held out by
# train_test_split — confirm this is acceptable.
mean_price = X_train_temp['LISTING_PRICE'].mean()
stddev = X_train_temp['LISTING_PRICE'].std()

In [None]:
# Standardizing the listing price (zero mean, unit standard deviation).
# The raw prices and their statistics come from the untouched X_train_temp, which makes
# this cell idempotent: re-running it (or the statistics cell) cannot standardise an
# already standardised column a second time.
raw_listing_price = X_train_temp['LISTING_PRICE']
X_train['LISTING_PRICE'] = (raw_listing_price - raw_listing_price.mean()) / raw_listing_price.std()


In [None]:
# Display the frame after standardising LISTING_PRICE
X_train

In [None]:
# One-hot (dummy) columns for OS and BROWSER; each column name is prefixed with
# the name of the source column
OS_onehot, BROWSER_onehot = (
    pd.get_dummies(X_train[column], prefix=column) for column in ('OS', 'BROWSER')
)


In [None]:
# Display the BROWSER dummy columns
BROWSER_onehot

In [None]:
# Display the OS dummy columns
OS_onehot

In [None]:
# One-hot (dummy) columns for the collapsed login and listing country codes,
# prefixed 'LOGIN' and 'LISTING' respectively
LOGIN_onehot, LISTING_onehot = (
    pd.get_dummies(X_train[column], prefix=label)
    for column, label in (
        ('LOGIN_COUNTRY_CODE', 'LOGIN'),
        ('LISTING_COUNTRY_CODE', 'LISTING'),
    )
)

In [None]:
# Display the listing-country dummy columns
LISTING_onehot

In [None]:
# Append all dummy columns to the right of the working frame
X_train = pd.concat(
    [X_train, OS_onehot, BROWSER_onehot, LOGIN_onehot, LISTING_onehot],
    axis='columns',
)

In [None]:
# Display the frame with the dummy columns appended
X_train


In [None]:
# Remove columns that are not fed to the models, plus the categorical source
# columns that the dummy columns now replace
X_train = X_train.drop(
    columns=[
        'LISTING_CITY',
        'IS_ARCHIVED',
        'ARCHIVE_REASON',
        'ANONYMISED_EMAIL',
        'OS',
        'BROWSER',
        'LOGIN_COUNTRY_CODE',
        'LISTING_COUNTRY_CODE',
    ]
)

In [None]:
# Display the frame after dropping the unused and source columns
X_train


In [None]:
## Removing dummy columns for entries not found in a later test dataset
X_train = X_train.drop(
    columns=['OS_BlackBerry', 'BROWSER_BlackBerry', 'LISTING_ng', 'BROWSER_Firefox iOS']
)

In [None]:
# Display the final feature frame (still including the IS_SCAMMER label column)
X_train

In [None]:
# 60/40 split into a fitting set and a held-out evaluation set.
# The split is stratified on IS_SCAMMER: the positive class is rare in this skewed
# dataset, and stratifying keeps the scammer ratio the same in both parts instead
# of leaving it to chance.
X_train_fit, X_test_fit, y_train_fit, y_test_fit = train_test_split(
    X_train.drop(columns=['IS_SCAMMER']),
    X_train['IS_SCAMMER'],
    test_size=0.40,
    random_state=101,
    stratify=X_train['IS_SCAMMER'],
)


In [None]:
# Logistic-regression baseline using the lbfgs solver, capped at 800 iterations
logmodel = LogisticRegression(solver='lbfgs', max_iter=800)

In [None]:
# Fit the logistic regression on the training part of the split
logmodel.fit(X_train_fit,y_train_fit)

In [None]:
# Precision / recall / F1 of the logistic regression on the training split, as a
# function of the decision threshold.
predictions_train = logmodel.predict(X_train_fit)
train_prob = logmodel.predict_proba(X_train_fit)
precision_train, recall_train, thresholds_train = precision_recall_curve(y_train_fit, train_prob[:, 1])
# F1 = 2PR / (P + R). Where precision and recall are both 0 the plain formula is 0/0
# (NaN), which would corrupt any later max/argmax; F1 is defined as 0 there instead.
pr_sum_train = precision_train + recall_train
f1_score_train = np.divide(
    2 * precision_train * recall_train,
    pr_sum_train,
    out=np.zeros_like(pr_sum_train),
    where=pr_sum_train > 0,
)
# The curves have one more point than there are thresholds, hence the [:-1].
plt.plot(thresholds_train, precision_train[:-1], "g--", label="Precision")
plt.plot(thresholds_train, recall_train[:-1], "b--", label="Recall")
plt.plot(thresholds_train, f1_score_train[:-1], "r--", label="F1Score")
plt.xlabel('thresholds')
plt.ylabel('Precision/Recall')
# Without this call the line labels above are never shown.
plt.legend()

In [None]:
# Pick the threshold that maximises the training F1 score.
# f1_score_train has one more entry than thresholds_train (the final precision=1 /
# recall=0 point has no threshold), so that entry is excluded to keep the index valid.
# nanargmax skips NaN entries (F1 is 0/0 where precision and recall are both 0);
# a plain max/argmax would be derailed by them.
best_train_idx = np.nanargmax(f1_score_train[:-1])
print(f1_score_train[best_train_idx])
new_threshold = thresholds_train[best_train_idx]  # threshold for logistic regression which maximises the f1_score
print(new_threshold)

In [None]:
# Evaluate the logistic regression on the held-out split.
predictions_test = logmodel.predict(X_test_fit)
test_prob = logmodel.predict_proba(X_test_fit)
precision_test, recall_test, thresholds_test = precision_recall_curve(y_test_fit, test_prob[:, 1])
# F1 is defined as 0 where precision + recall == 0 (avoids 0/0 = NaN).
pr_sum_test = precision_test + recall_test
f1_score_test = np.divide(
    2 * precision_test * recall_test,
    pr_sum_test,
    out=np.zeros_like(pr_sum_test),
    where=pr_sum_test > 0,
)
# Honest estimate: apply the threshold chosen on the TRAINING split (new_threshold).
# Tuning the threshold on the test labels leaks them into the model choice and
# overstates the score, so new_threshold is deliberately not overwritten here.
predictions_test_tuned = (test_prob[:, 1] >= new_threshold).astype(int)
print('Test F1 at the train-selected threshold:', f1_score(y_test_fit, predictions_test_tuned))
print('Train-selected threshold:', new_threshold)
# For reference only: the best F1 obtainable if the threshold were tuned on the test split.
print('Best possible test F1 (threshold tuned on test, optimistic):', np.max(f1_score_test[:-1]))

In [None]:
## Using a neural network model to maximise the F1 score and find a new threshold
nnmodel=MLPClassifier(solver='adam', learning_rate_init=0.009,alpha=0.3,hidden_layer_sizes=(10,50), random_state=1,max_iter=600)
nnmodel.fit(X_train_fit,y_train_fit)
train_prob = nnmodel.predict_proba(X_train_fit)
precision_train, recall_train, thresholds_train = precision_recall_curve(y_train_fit, train_prob[:, 1])
# F1 = 2PR / (P + R); defined as 0 where precision + recall == 0 so that no NaN
# (0/0) enters the max/argmax below.
pr_sum_train = precision_train + recall_train
f1_score_train = np.divide(
    2 * precision_train * recall_train,
    pr_sum_train,
    out=np.zeros_like(pr_sum_train),
    where=pr_sum_train > 0,
)
# The curves have one more point than there are thresholds, hence the [:-1].
plt.plot(thresholds_train, precision_train[:-1], "g--", label="Precision")
plt.plot(thresholds_train, recall_train[:-1], "b--", label="Recall")
plt.plot(thresholds_train, f1_score_train[:-1], "r--", label="F1Score")
plt.xlabel('thresholds')
plt.ylabel('Precision/Recall')
# Without this call the line labels above are never shown.
plt.legend()
# Restrict the search to entries that have a matching threshold so the index is
# always valid for thresholds_train.
best_train_idx = np.argmax(f1_score_train[:-1])
print(f1_score_train[best_train_idx])
new_threshold = thresholds_train[best_train_idx]  # threshold for the neural network which maximises the training f1_score
print(new_threshold)

In [None]:
# Calculating the neural network's performance on the test set
test_prob = nnmodel.predict_proba(X_test_fit)
precision_test, recall_test, thresholds_test = precision_recall_curve(y_test_fit, test_prob[:, 1])
# F1 is defined as 0 where precision + recall == 0 (avoids 0/0 = NaN).
pr_sum_test = precision_test + recall_test
f1_score_test = np.divide(
    2 * precision_test * recall_test,
    pr_sum_test,
    out=np.zeros_like(pr_sum_test),
    where=pr_sum_test > 0,
)
plt.plot(thresholds_test, precision_test[:-1], "g--", label="Precision")
plt.plot(thresholds_test, recall_test[:-1], "b--", label="Recall")
plt.plot(thresholds_test, f1_score_test[:-1], "r--", label="F1Score")
plt.xlabel('thresholds')
plt.ylabel('Precision/Recall')
# Without this call the line labels above are never shown.
plt.legend()
# Honest estimate: apply the threshold chosen on the TRAINING split (new_threshold).
# Tuning the threshold on the test labels leaks them into the model choice and
# overstates the score, so new_threshold is deliberately not overwritten here.
nn_predictions_test_tuned = (test_prob[:, 1] >= new_threshold).astype(int)
print('Test F1 at the train-selected threshold:', f1_score(y_test_fit, nn_predictions_test_tuned))
print('Train-selected threshold:', new_threshold)
# For reference only: the best F1 obtainable if the threshold were tuned on the test split.
print('Best possible test F1 (threshold tuned on test, optimistic):', np.max(f1_score_test[:-1]))

In [None]:
# Example of a grid search over hyperparameters: the candidate values are for alpha,
# which is a regularization parameter, and run over the powers of ten 1e-1 ... 1e-6
parameters = {'alpha': np.power(10.0, -np.arange(1, 7))}
print(parameters)

In [None]:
# 5-fold cross-validated grid search over `parameters` for the MLP, scored by F1
clf = GridSearchCV(estimator=nnmodel, param_grid=parameters, scoring='f1', cv=5)

In [None]:
# Run the grid search on the training part of the split
clf.fit(X_train_fit,y_train_fit)

In [None]:
# Alphabetical list of the result fields recorded by the grid search
sorted(clf.cv_results_)

In [None]:
# Mean cross-validated score (scoring='f1') for each candidate in the grid
clf.cv_results_['mean_test_score']
