In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Both tables live in the same competition folder; pandas infers the zip
# compression from the file extension.
DATA_DIR = '/kaggle/input/facebook-recruiting-iv-human-or-bot'
train_df = pd.read_csv(f'{DATA_DIR}/train.csv.zip')
bids_df = pd.read_csv(f'{DATA_DIR}/bids.csv.zip')

In [None]:
# Preview the first rows of the table loaded from train.csv.zip.
train_df.head()

In [None]:
# Preview the first rows of the table loaded from bids.csv.zip.
bids_df.head()

In [None]:
# Number of rows per `outcome` value, i.e. the class balance of the column
# that is later used as the prediction target.
train_df['outcome'].value_counts()

In [None]:
# Column names, dtypes and non-null counts of the training table.
train_df.info()

In [None]:
# Column names, dtypes and non-null counts of the bids table.
bids_df.info()

## Merge two tables.

In [None]:
# Attach the columns of train_df (including `outcome`) to every bid placed by
# the same bidder. The inner join keeps only bids whose bidder_id appears in
# train_df.
#
# The previous `set_index('bidder_id')` calls returned new frames that were
# discarded, so they had no effect; the merge silently relied on pd.merge's
# default of joining on all shared column names. The key is now explicit.
df = pd.merge(train_df, bids_df, on='bidder_id', how='inner')
df.info()

In [None]:
# Discard the bid_id column. `errors='ignore'` makes the cell safe to re-run:
# without it a second execution raises KeyError because the column is
# already gone.
df = df.drop(columns='bid_id', errors='ignore')

## Top 3 sources of bot bids

In [None]:
# Boolean selector for the rows whose outcome equals 1.0, and the
# corresponding subset of the merged frame.
mask = df['outcome'].eq(1.0)
df_bot = df.loc[mask]

In [None]:
import seaborn as sns

# Bars count rows of df_bot, so the top-3 ranking must come from df_bot too.
# Ranking by the full frame (as before) showed the three most common
# countries overall, not the three with the most bot bids.
sns.countplot(x="country", data=df_bot,
              order=df_bot["country"].value_counts().iloc[:3].index)

In [None]:
# Rank IPs by their number of rows in df_bot (not in the full frame) so the
# plot shows the three IPs with the most bot bids.
sns.countplot(x="ip", data=df_bot,
              order=df_bot["ip"].value_counts().iloc[:3].index)

In [None]:
# Rank devices by their number of rows in df_bot (not in the full frame) so
# the plot shows the three devices with the most bot bids.
sns.countplot(x="device", data=df_bot,
              order=df_bot["device"].value_counts().iloc[:3].index)

In [None]:
# Rank URLs by their number of rows in df_bot (not in the full frame) so the
# plot shows the three URLs with the most bot bids.
sns.countplot(x="url", data=df_bot,
              order=df_bot["url"].value_counts().iloc[:3].index)

In [None]:
# Rank merchandise categories by their number of rows in df_bot (not in the
# full frame) so the plot shows the three categories with the most bot bids.
sns.countplot(x="merchandise", data=df_bot,
              order=df_bot["merchandise"].value_counts().iloc[:3].index)

## Encode labels

In [None]:
target = 'outcome'
# `features` spans every column of df, the target included.
features = df.columns.values.tolist()
# Object-dtype columns are treated as categorical; everything else as numeric.
categorical_features = [name for name in features if df[name].dtype == 'object']
numeric_features = [name for name in features if df[name].dtype != 'object']

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every categorical column with integer codes, one encoder per column.
for col in categorical_features:
    # A column that is no longer object-typed has already been encoded.
    # Encoding it again would stringify the integer codes and re-rank them
    # lexicographically ("10" < "2"), silently changing the encoding when the
    # cell is re-run.
    if df[col].dtype != 'object':
        continue
    le = LabelEncoder()
    # fit_transform converts the column to strings once instead of building
    # the same string list twice for separate fit() and transform() calls.
    df[col] = le.fit_transform(df[col].astype(str))

## Build Model

In [None]:
y = df['outcome']

# Use only the bid-level columns as features. Every column that came from
# train_df (bidder_id and whatever else that table carries) describes the
# bidder rather than the bid, and `outcome` is attached per bidder by the
# merge. Leaving those columns in X lets the model look up the label from the
# bidder's identity instead of learning from bidding behaviour.
# NOTE(review): assumes train_df has one row per bidder_id - confirm.
bidder_level_cols = ['outcome'] + [
    col for col in train_df.columns
    if col != 'outcome' and col in df.columns
]
X = df.drop(columns=bidder_level_cols)

In [None]:
# train_test_split stays imported in case other cells still use it.
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# Split by bidder, not by bid. Each row is one bid and `outcome` is the same
# for all bids of a bidder, so a row-wise random split puts bids from the same
# bidder on both sides and the test score mostly measures memorisation.
# With a group-wise split, test_size is the fraction of *bidders* held out.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
train_idx, test_idx = next(splitter.split(X, y, groups=df['bidder_id']))

# split() yields positional indices, hence iloc.
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Five-tree random forest with a fixed seed, fitted on the training split.
forest = RandomForestClassifier(random_state=2, n_estimators=5)
forest.fit(X_train, y_train)

In [None]:
# Mean accuracy on the data the forest was fitted on and on the held-out split.
train_accuracy = forest.score(X_train, y_train)
test_accuracy = forest.score(X_test, y_test)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
print("Accuracy on test set: {:.3f}".format(test_accuracy))

In [None]:
import matplotlib.pyplot as plt

features = list(X.columns)
importances = forest.feature_importances_
# argsort is ascending, so the most important feature becomes the top bar.
indices = np.argsort(importances)

# Horizontal bar chart of the forest's importances, labelled by column name.
bar_positions = range(len(indices))
fig, ax = plt.subplots()
ax.barh(bar_positions, importances[indices], color='b', align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels([features[i] for i in indices])
ax.set_title('Feature Importances')
ax.set_xlabel('Relative Importance')
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix

y_pred = forest.predict(X_test)

# Pin the label order so the result is always 2x2 with rows/columns in
# (0, 1) order. Without `labels`, the matrix is built from the classes that
# happen to occur in y_test / y_pred, and the hard-coded 2x2 axis names below
# would fail if one class were missing. Labels are floats because `outcome`
# is compared against 1.0 when the bot rows are selected.
cm = confusion_matrix(y_test, y_pred, labels=[0.0, 1.0])

# Rows are the true classes, columns the predicted classes.
conf_matrix = pd.DataFrame(
    data=cm,
    columns=['Predicted:0', 'Predicted:1'],
    index=['Actual:0', 'Actual:1'],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu")
