In [None]:
# Import the necessary libraries
import os
import pandas as pd
import numpy as np
import numpy as nan  # NOTE(review): binds the whole numpy module to the name `nan`, not the NaN value; missing values in this notebook are written as np.nan
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Read the zipped train and test user tables from the Kaggle input folder
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip',
        '../input/airbnb-recruiting-new-user-bookings/test_users.csv.zip',
    )
)

In [None]:
# Keep the training targets as a plain array before train and test are merged
labels = train['country_destination'].values

In [None]:
# Stack train and test into one frame so that both share the same categories
# (for example, the 'weibo' signup_method is missing from the train users)
train_data = pd.concat([train, test], axis='index', ignore_index=True)

# Data exploration

In [None]:
# Number of rows and columns in the combined train + test frame
train_data.shape

In [None]:
# Column dtypes and non-null counts of the combined frame
train_data.info()

In [None]:
# Display the combined train + test frame
train_data

In [None]:
# Preview the first five rows of the combined frame
train_data.head()

In [None]:
# Frequency of each signup method
train_data['signup_method'].value_counts()

In [None]:
# Frequency of each gender category
train_data['gender'].value_counts()

**Data Visualization**

In [None]:
# Bar chart of signup-method frequencies, counting missing values as their own bar
signup_method_counts = train_data['signup_method'].value_counts(dropna=False)
signup_method_counts.plot.bar()

In [None]:
# Count plot of gender, with the share of all rows annotated above each bar
plt.figure(figsize=(13,6))
gender_counts = train_data['gender'].value_counts()
sns.countplot(data=train_data, x='gender', order=gender_counts.index, color=sns.color_palette()[5])
plt.xlabel('Gender')
plt.ylabel('Count')

total_rows = train_data.shape[0]
# Iterate by position: the counts are indexed by gender label, so an integer
# lookup such as counts[i] would rely on a deprecated positional fallback.
for position, count in enumerate(gender_counts.to_numpy()):
    share = '{:0.1f}%'.format(100 * count / total_rows)
    # Place the label slightly above the top of the bar
    plt.text(position, count + 1000, share, ha='center')

In [None]:
# Bar chart of gender frequencies, including missing values.
# The '-unknown-' category cannot be interpreted as a gender, so it is cleaned up further down.
gender_counts_with_missing = train_data['gender'].value_counts(dropna=False)
gender_counts_with_missing.plot(kind='bar')

This plot shows that:
- The largest share of users has an unknown gender
- There are more females than males
- The count for "other" is too small to read from the chart

In [None]:
# Age distribution per gender; the ages above 2000 stretch the axis and flatten the boxes
box_color = sns.color_palette()[7]
sns.boxplot(data=train_data, x='gender', y='age', color=box_color)

In [None]:
# Set the figure size before plotting: rc settings only affect figures created afterwards
sns.set(rc={'figure.figsize':(15,10)})
sns.scatterplot(x='signup_flow', y='age', data=train_data);
# The age axis is very wide: most users are younger than 100, while a few
# records have ages above 2000, which suggests those values are wrong.

In [None]:
# Set the figure size before plotting: rc settings only affect figures created afterwards
sns.set(rc={'figure.figsize':(10,8)})
# Distribution of signup_flow for each signup app
sns.boxplot(x='signup_app', y='signup_flow', color=sns.color_palette()[7], data=train_data);


The plot shows that the apps with the highest signup flow values are Android and Moweb, followed by Web and then iOS.

In [None]:
# Treat the '-unknown-' placeholder as a missing value.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on the attribute-accessed column is not guaranteed to update train_data.
train_data['gender'] = train_data['gender'].replace('-unknown-', np.nan)

In [None]:
sns.lineplot(x = 'gender', y = 'signup_flow' ,color = sns.color_palette()[6], data = train_data)
# Line plot of signup_flow against the gender categories.
# Author's observation: the highest signup flow belonged to the unknown gender, which motivated removing it.
# NOTE(review): '-unknown-' is replaced with NaN in the preceding cell, so the unknown group is presumably no longer drawn here — confirm against the rendered plot

Gender distribution after replacing the unknown values:

In [None]:
# Gender frequencies after '-unknown-' was turned into NaN;
# dropna=True leaves the missing values out of the chart
known_gender_counts = train_data['gender'].value_counts(dropna=True)
known_gender_counts.plot(kind='bar')

This plot shows that:
- Females make up the largest share of the known genders
- The "other" category has the lowest count
- After removing the unknown values, the "other" category is easier to see

In [None]:
# Set the figure size before plotting: rc settings only affect figures created afterwards
sns.set(rc={'figure.figsize':(15,10)})
sns.scatterplot(x='signup_method', y='age', data=train_data);
# The age axis is very wide: most users are younger than 100, while a few
# records have ages above 2000, which suggests those values are wrong.

In [None]:
# Set the figure size before plotting: rc settings only affect figures created afterwards
sns.set(rc={'figure.figsize':(18,9)})
sns.barplot(x='first_device_type', y='signup_flow', data=train_data);
# Author's observations from the chart:
# - iPhone devices have by far the highest signup flow (about 18), followed by Android
# - Windows Desktop has a very low signup flow

In [None]:
# Report every column that contains at least one missing value
null_counts = train_data.isnull().sum()
for column, n_missing in null_counts[null_counts != 0].items():
    print(f"{column} has {n_missing} null values.")
    print()

In [None]:
# Missing-value count for every column, before any imputation
print("count of NULL values before imputation\n")
train_data.isna().sum()

## Data cleaning

## 1. Age

Restrict the data between 14 and 105 years old and set the rest as null

In [None]:
# Ages outside the inclusive range 14..105 are treated as missing
age_in_range = train_data['age'].between(14, 105)
train_data.loc[~age_in_range, 'age'] = np.nan

In [None]:
# With ages limited to 14..105 and the unknown genders removed, the boxes are readable
box_color = sns.color_palette()[5]
sns.boxplot(data=train_data, x='gender', y='age', color=box_color)

In [None]:
# Set the figure size before plotting: rc settings only affect figures created afterwards
sns.set(rc={'figure.figsize':(15,10)})
sns.scatterplot(x='signup_method', y='age', data=train_data);
# The scatter plot is easier to read now that age is capped at 105

Next, fill the null values with the mean

In [None]:
# Replace missing ages with the mean of the known ages
mean_age = train_data['age'].mean()
train_data['age'] = train_data['age'].fillna(mean_age)

In [None]:
# Number of missing ages left after the mean imputation
train_data['age'].isna().sum()

In [None]:
# Summary statistics of the numeric columns
train_data.describe()

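Finally, we normalize the age values to put them on a common scale. Note that the normalized values are computed below but are not written back to the data.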

In [None]:
from sklearn.preprocessing import normalize

# Reshape age into a single-column 2-D array and normalize along axis 0,
# i.e. the column is normalized as one feature
age_column = train_data['age'].to_numpy().reshape(-1, 1)
X = normalize(age_column, axis=0)

In [None]:
# Inspect the normalized age values
X

In [None]:
#train_data['age'] = X

## Gender

In [None]:
# Frequency of each gender category
train_data['gender'].value_counts()

In [None]:
# Number of rows whose gender is missing
train_data['gender'].isna().sum()

Fill the null values with a new category 'unknown'

In [None]:
# Label missing genders with an explicit 'unknown' category
gender = train_data['gender']
train_data['gender'] = gender.where(gender.notna(), 'unknown')

## Date_first_booking, ID, timestamp_first_active, date_account_created

`date_first_booking` contains too many null values, and I don't think the other columns would affect the prediction anyway, so all four columns will be dropped.

In [None]:
# Remove the identifier and the date/timestamp columns from the feature set
unused_columns = ['id', 'date_first_booking', 'timestamp_first_active', 'date_account_created']
train_data = train_data.drop(columns=unused_columns)

## First_affiliate_tracked

Fill null values with mode

In [None]:
# Replace missing values with the most frequent category
most_common_affiliate = train_data['first_affiliate_tracked'].mode().iloc[0]
train_data['first_affiliate_tracked'] = train_data['first_affiliate_tracked'].fillna(most_common_affiliate)

In [None]:
# Missing-value count for every column, after the imputation steps
print("count of NULL values after imputation\n")
train_data.isna().sum()

The 62096 remaining null values are in `country_destination`; they come from the test data we concatenated earlier, which has no destination labels.

# Encoding categorical data

In [None]:
# Columns to one-hot encode
categorical = [
    'gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel',
    'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
    'first_device_type', 'first_browser',
]

# Replace each listed column with indicator columns named '<column>_<value>';
# the untouched columns stay first and the indicators are appended in list order
train_data = pd.get_dummies(train_data, columns=categorical)

In [None]:
# Preview the frame after one-hot encoding
train_data.head()

# Preparing data

## Split the test data

In [None]:
# train and test were concatenated in that order, so the training rows come
# first and the test rows are everything after them.
n_train_rows = len(labels)  # labels holds one target value per training row
# drop() returns a new frame, so nothing is modified in place on a slice of train_data
test_data = train_data[n_train_rows:].drop(columns=['country_destination'])
test_data

In [None]:
# Keep only the first 213451 rows of the combined frame as training data
train_data = train_data.iloc[:213451]

In [None]:
# Scaling the data
# Imported here because StandardScaler is not part of the notebook's import cell
from sklearn.preprocessing import StandardScaler

# Scale the frames that are actually used for modelling (train_data / test_data),
# not the raw `train` / `test` frames, which still contain text columns.
scaler = StandardScaler()
# NOTE(review): only `age` is scaled, on the assumption that the remaining feature columns are one-hot indicators — confirm.
# NOTE(review): the scaler is fitted on all training rows, i.e. before the train/validation split made in the model section.
train_data = train_data.assign(age=scaler.fit_transform(train_data[['age']]).ravel())
# Reuse the statistics learned from the training rows for the test rows
test_data = test_data.assign(age=scaler.transform(test_data[['age']]).ravel())

# Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Features: every column except the target
X = train_data.drop(columns=['country_destination'])

# Target: the destination country encoded as integer class ids
label_enc = LabelEncoder()
y = label_enc.fit_transform(train_data['country_destination'])

# 80/20 split, stratified on the encoded target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4, stratify=y
)

## KNN

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy on the held-out split for each number of neighbours from 3 to 14
n = range(3,15)
scores = {}
for ne in n:
    neigh = KNeighborsClassifier(ne)
    neigh.fit(X_train, y_train)
    predictions = neigh.predict(X_test)
    scores[ne] = accuracy_score(y_test, predictions)

# Final model with k = 12. It is fitted once, after the search: fitting it
# inside the loop repeats the same work on every iteration without changing
# the resulting model, predictions or accuracy.
neigh = KNeighborsClassifier(12)
neigh.fit(X_train, y_train)
predictions = neigh.predict(X_test)
accuracy = accuracy_score(y_test,predictions)

In [None]:
# Accuracy for each tested number of neighbours
scores

A maximum score of 54% is reached at k = 12

## XGB

In [None]:
from xgboost.sklearn import XGBClassifier

# Gradient-boosted trees with row and column subsampling
xgb_params = dict(
    max_depth=12, learning_rate=0.2, n_estimators=25, objective='multi:softmax',
    subsample=0.5, colsample_bytree=0.5, seed=0,
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

# Accuracy of the boosted model on the held-out split
pred = xgb.predict(X_test)
accuracy = accuracy_score(y_test, pred)

In [None]:
# Accuracy of the XGBoost model on the held-out split
accuracy