#### Import Required Library

In [None]:
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import * 
from pyspark.ml.feature import StringIndexer ,OneHotEncoder,VectorAssembler
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
from pyspark.ml.classification import RandomForestClassifier ,DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator 
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType

In [None]:
# Start (or reuse) the Spark session that every later cell reads data through.
spark = (
    SparkSession.builder
    .appName('Airbnb')
    .getOrCreate()
)

#### Import Sessions Data

In [None]:
!unzip ../input/airbnb-recruiting-new-user-bookings/sessions.csv.zip -d ./
# Load the extracted sessions CSV. The first row supplies the column names;
# no schema inference is requested, so every column arrives as a string.
sessions_reader = spark.read.format('csv').option('header', 'true')
SessionData = sessions_reader.load('./sessions.csv')

In [None]:
# Preview the first three session rows as a pandas DataFrame (shown as the cell output).
SessionData.limit(3).toPandas()

In [None]:
# Print the column names and Spark data types of the sessions DataFrame.
SessionData.printSchema()

#### Drop the columns (Action Detail, Device Type) from the Sessions data
#### Convert Datatype of Column Secs Elapsed from String to Double

In [None]:
# Discard the two columns named in the heading above, then convert the
# string-typed elapsed seconds to a double so it can be handled numerically.
SessionData = (
    SessionData
    .drop('action_detail', 'device_type')
    .withColumn('secs_elapsed', col('secs_elapsed').cast(DoubleType()))
)

#### Check every string column for the '-unknown-' placeholder and print how often it occurs; where it occurs, replace those values with null

In [None]:
# UDF that maps the '-unknown-' placeholder to null. It is kept because a later
# cell applies it to the users' gender column; the loop below no longer needs it.
replace_Unknown = udf(lambda value: None if value=="-unknown-" else value, StringType())

# Resolve the string-typed columns once. The previous test `dtype in ("string")`
# was a substring check against a plain str (the parentheses do not make a
# tuple), and the dtype mapping was rebuilt for every column.
session_string_columns = [name for name, dtype in SessionData.dtypes if dtype == 'string']

for thisCol in session_string_columns:
    UnknowCount = SessionData.where(col(thisCol) == '-unknown-').count()
    print('The Count of Unknown in Column %s ' %thisCol , UnknowCount )
    if UnknowCount > 0 :
        # Native column expression instead of a per-row Python UDF: keep the
        # value only when it differs from the placeholder. With no otherwise()
        # branch the placeholder (and existing nulls) become null.
        SessionData = SessionData.withColumn(
            thisCol, when(col(thisCol) != '-unknown-', col(thisCol)))
        print('The Count of Unknown in Column %s After Converting to null' %thisCol , SessionData.where(col(thisCol) == '-unknown-').count() )

#### Printing the count of null values in each column of session dataset

In [None]:
# Count nulls for every column in ONE pass over the data instead of launching
# a separate full scan per column. count() ignores nulls, so counting a marker
# that is only produced for null cells yields the number of nulls.
session_null_row = SessionData.select(
    [F.count(F.when(F.col(name).isNull(), 1)).alias(name) for name in SessionData.columns]
).first()

# The aggregated row has one field per column, in column order.
for thisCol, null_total in zip(SessionData.columns, session_null_row):
    print('The Count of Null in Column %s ' %thisCol , null_total )

#### Drop Null values from column (User Id,Action) in Sessions Dataset
#### Fill Null values in Column Action Type with 'Other' value
#### Fill Null values in Column Secs Elapsed with the Median of Secs Elapsed

In [None]:
# Rows without a user id or an action cannot be used, so drop them.
SessionData = SessionData.na.drop(subset=['user_id', 'action'])

# A missing action type becomes the explicit category 'Other'.
SessionData = SessionData.na.fill(value='Other', subset=['action_type'])

# Fill missing elapsed times with the (approximate) median.
# The third argument is the relative rank error: the previous 0.25 allowed the
# returned "median" to be anything between the 25th and 75th percentile.
# 0.01 keeps the estimate within one percentile of the true median.
MEDIAN_RELATIVE_ERROR = 0.01
median_secs = SessionData.approxQuantile("secs_elapsed", [0.5], MEDIAN_RELATIVE_ERROR)
SessionData = SessionData.na.fill(value=median_secs[0], subset=['secs_elapsed'])

#### Printing the count of null values in each column of session dataset after cleansing process

In [None]:
# Re-check the null counts after cleansing, again with a single aggregation
# pass rather than one full scan per column. count() skips nulls, so the
# marker emitted only for null cells is what gets counted.
cleaned_null_row = SessionData.select(
    [F.count(F.when(F.col(name).isNull(), 1)).alias(name) for name in SessionData.columns]
).first()

# The aggregated row has one field per column, in column order.
for thisCol, null_total in zip(SessionData.columns, cleaned_null_row):
    print('The Count of Null in Column %s ' %thisCol , null_total )

#### Import Train Users Dataset

In [None]:
!unzip ../input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip -d ./
# Load the extracted train-users CSV. The first row supplies the column names;
# no schema inference is requested, so every column arrives as a string.
users_reader = spark.read.format('csv').option('header', 'true')
TrainData = users_reader.load('./train_users_2.csv')

In [None]:
# Preview the first three user rows as a pandas DataFrame (shown as the cell output).
TrainData.limit(3).toPandas()

In [None]:
# Print the column names and Spark data types of the train-users DataFrame.
TrainData.printSchema()

#### Check every string column for the '-unknown-' placeholder and print how often it occurs

In [None]:
# Resolve the string-typed columns once. The previous test `dtype in ("string")`
# was a substring check against a plain str (the parentheses do not make a
# tuple), and the dtype mapping was rebuilt for every column.
train_string_columns = [name for name, dtype in TrainData.dtypes if dtype == 'string']

# Report how often the '-unknown-' placeholder occurs in each string column.
for thisCol in train_string_columns:
    UnknowCount = TrainData.where(col(thisCol) == '-unknown-').count()
    print('The Count of Unknown in Column %s ' %thisCol , UnknowCount )

#### Printing the count of null values in each column of Train User dataset

In [None]:
# Count nulls for every column in ONE pass over the data instead of launching
# a separate full scan per column. count() ignores nulls, so counting a marker
# that is only produced for null cells yields the number of nulls.
train_null_row = TrainData.select(
    [F.count(F.when(F.col(name).isNull(), 1)).alias(name) for name in TrainData.columns]
).first()

# The aggregated row has one field per column, in column order.
for thisCol, null_total in zip(TrainData.columns, train_null_row):
    print('The Count of Null in Column %s ' %thisCol , null_total )

#### Split each column of (Timestamp First Active , Date Account Created , Date First Booking) to three column Day , Month and Year
#### Convert the datatype of column Age from String to Double and of column Signup Flow from String to Integer
#### Replace the Other values in Gender to Null
#### Replace the Unknown values in Gender to Null

In [None]:
# Gender values that carry no usable information and are therefore nulled.
# isin() yields null for an already-null gender, so such rows fall through
# when() without a match and stay null.
gender_is_placeholder = col('gender').isin('OTHER', '-unknown-')

# Split each date-like string into integer day / month / year columns by
# character position. Spark's substring() is 1-based, so the year starts at
# position 1 (a start of 0 returns the same characters but hides that).
# The offsets assume a compact 'yyyyMMdd...' timestamp and 10-character
# 'yyyy?MM?dd' dates - TODO confirm against the raw file.
TrainData = (
    TrainData
    .withColumn('day_first_active', substring('timestamp_first_active', 7, 2).cast(IntegerType()))
    .withColumn('month_first_active', substring('timestamp_first_active', 5, 2).cast(IntegerType()))
    .withColumn('year_first_active', substring('timestamp_first_active', 1, 4).cast(IntegerType()))
    .withColumn('day_account_created', substring('date_account_created', 9, 2).cast(IntegerType()))
    .withColumn('month_account_created', substring('date_account_created', 6, 2).cast(IntegerType()))
    .withColumn('year_account_created', substring('date_account_created', 1, 4).cast(IntegerType()))
    .withColumn('day_first_booking', substring('date_first_booking', 9, 2).cast(IntegerType()))
    .withColumn('month_first_booking', substring('date_first_booking', 6, 2).cast(IntegerType()))
    .withColumn('year_first_booking', substring('date_first_booking', 1, 4).cast(IntegerType()))
    # Age is parsed as a double here; the age-cleaning cell turns it into an integer.
    .withColumn('age', col('age').cast(DoubleType()))
    .withColumn('signup_flow', col('signup_flow').cast(IntegerType()))
    # One native expression replaces two chained Python UDFs: keep the gender
    # only when it is not a placeholder, otherwise null.
    .withColumn('gender', when(~gender_is_placeholder, col('gender')))
)

#### Drop the columns (Date First Booking, Date Account Created, Timestamp First Active, First Device Type, First Browser) from the Train Users dataset

In [None]:
# Remove the raw date strings (already split into parts) together with the
# device-type and browser columns.
train_columns_to_drop = [
    'date_first_booking',
    'date_account_created',
    'timestamp_first_active',
    'first_device_type',
    'first_browser',
]
TrainData = TrainData.drop(*train_columns_to_drop)

#### A user's age should be between 10 and 120, so all values outside this range are replaced with the median age
#### Replace the Null values in Age with the Median

In [None]:
#Age
# Plausible age range. Both bounds are inclusive everywhere below; previously
# the median was computed on the exclusive range (10, 120) while the
# replacement rule treated 10 and 120 themselves as valid.
MIN_VALID_AGE = 10
MAX_VALID_AGE = 120

# Median of the plausible ages only. The third argument is the relative rank
# error: the former 0.25 allowed any value between the 25th and 75th percentile.
TempTrainData = TrainData.filter(col('age').between(MIN_VALID_AGE, MAX_VALID_AGE))
median_age = TempTrainData.approxQuantile("age", [0.5], 0.01)
fallback_age = int(median_age[0])

# Native expression instead of na.fill + a Python UDF: truncate the age to an
# integer, keep it when it is in range, otherwise use the median. A null age
# fails the range test and therefore also receives the median.
age_as_int = col('age').cast(IntegerType())
TrainData = TrainData.withColumn(
    'age',
    when(age_as_int.between(MIN_VALID_AGE, MAX_VALID_AGE), age_as_int)
    .otherwise(lit(fallback_age)))

#### Replace the Null values of Gender with the most frequent Value in the Column

In [None]:
#Gender
# Tally the two concrete gender values, most frequent first.
gender_tally = (
    TrainData
    .where(col('gender').isin('FEMALE', 'MALE'))
    .groupBy('gender')
    .count()
    .orderBy(col('count').desc())
)
MostFrequentGender = gender_tally.first()['gender']

# Missing genders take the most frequent value.
TrainData = TrainData.na.fill(value=MostFrequentGender, subset=['gender'])

#### Join the datasets (Sessions, Train Users) using the User ID key
#### Drop one key from the new Dataframe

In [None]:
# Inner-join every session row to its user record, then drop the users' `id`
# column because it duplicates the sessions' `user_id`.
same_user = SessionData['user_id'] == TrainData['id']
TrainUserDF = SessionData.join(TrainData, on=same_user, how='inner').drop('id')

In [None]:
# Preview the first three joined rows as a pandas DataFrame (shown as the cell output).
TrainUserDF.limit(3).toPandas()

#### Printing the number of Columns and Rows of the Dataframes ( Sessions , Train User , Joined Data Frame)

In [None]:
# Report (row count, column count) for each DataFrame, in the same order as before.
for frame_label, frame in (
        ("SessionData", SessionData),
        ("TrainData", TrainData),
        ("TrainUserDF", TrainUserDF)):
    print(frame_label, (frame.count(), len(frame.columns)))

#### Retrieve the categorical columns into one list

In [None]:
# Names of all string-typed columns of the joined DataFrame.
CategoricalColumn = [
    column_name
    for column_name, column_type in TrainUserDF.dtypes
    if column_type.startswith('string')
]
CategoricalColumn

#### Retrieve the numerical columns into one list

In [None]:
# Names of all int- and double-typed columns of the joined DataFrame.
# str.startswith accepts a tuple of prefixes, which replaces the bitwise `|`
# that was being applied to two booleans.
NumericalColumn = [
    column_name
    for column_name, column_type in TrainUserDF.dtypes
    if column_type.startswith(('int', 'double'))
]
NumericalColumn

#### Create list of required Features that will be used in the classification

In [None]:
# Input columns for the feature vector. Names ending in `_encoded` are the
# one-hot outputs produced by the encoder stages; the rest are numeric columns.
#
# The day/month/year of the FIRST BOOKING are deliberately not included: they
# are derived from the very booking whose destination is being predicted, so
# they leak the target. They are also null for any user without a booking
# date, and the assembler's handleInvalid="skip" would drop those rows from
# both training and evaluation.
requiredFeatures = [
    # session-level features
    'action_encoded',
    'action_type_encoded',
    'secs_elapsed',
    # user-level features
    'gender_encoded',
    'age',
    'signup_method_encoded',
    'signup_flow',
    'affiliate_channel_encoded',
    'affiliate_provider_encoded',
    'first_affiliate_tracked_encoded',
    'signup_app_encoded',
    # date parts known at signup time
    'day_first_active',
    'month_first_active',
    'year_first_active',
    'day_account_created',
    'month_account_created',
    'year_account_created',
]

#### Create an array of StringIndexers to convert the Categorical values to indices except the label value
#### Create an array of OneHotEncoders to encode the Categorical values except the label value

In [None]:
# Only index/encode the categorical columns whose one-hot output is actually
# fed to the assembler. Previously every string column was processed. That
# included `user_id`, which is still a string after the join and has one
# category per user, so its one-hot vector was huge and never used. With
# handleInvalid='skip', an unused indexer can also silently drop rows whose
# value was not seen during fitting.
wanted_encoded_outputs = set(requiredFeatures)
encodedColumns = [
    column for column in CategoricalColumn
    if column != 'country_destination'          # the label has its own indexer
    and column + '_encoded' in wanted_encoded_outputs
]

# One StringIndexer per feature column: category string -> numeric index.
indexers = [
    StringIndexer(
        inputCol=column,
        outputCol=column + '_index',
        handleInvalid='skip')
    for column in encodedColumns
]

# One OneHotEncoder per feature column: numeric index -> one-hot vector.
encoders = [
    OneHotEncoder(
        inputCol=column + '_index',
        outputCol=column + '_encoded')
    for column in encodedColumns
]


#### Create a separate StringIndexer (held in a list) to convert the label column to indices

In [None]:
# Indexer for the label column (country_destination -> country_destination_index).
# It is wrapped in a list so it can be concatenated with the other stage lists
# when the pipelines are assembled.
labelIndexer = [StringIndexer(inputCol='country_destination', outputCol='country_destination_index')]

#### Create a VectorAssembler that combines a given list of columns into a single vector column

In [None]:
# Combine all columns listed in requiredFeatures into the single vector column 'features'.
# NOTE(review): handleInvalid="skip" - per the VectorAssembler documentation this filters
# out rows that have a null/NaN in any input column instead of raising; confirm the
# resulting row loss is intended.
assembler = VectorAssembler(inputCols=requiredFeatures, outputCol='features' , handleInvalid = "skip")

####  Split the data into training and test sets

In [None]:
# 80/20 random split of the joined rows. A fixed seed makes the split - and
# therefore the reported accuracies - reproducible from run to run; without it
# every execution evaluates on a different sample.
SPLIT_SEED = 42
(trainingData, testData) = TrainUserDF.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

#### Specify First Estimator Classifier (Random Forest Classifier)
#### Create Pipeline to be used to build the Model contains all the Transformers and ends with the Estimator

In [None]:
# Random forest estimator reading the assembled vector and the indexed label.
rf = RandomForestClassifier(featuresCol='features', labelCol='country_destination_index')

# Stage order: feature indexers, one-hot encoders, label indexer, assembler, classifier.
RfcStages = [*indexers, *encoders, *labelIndexer, assembler, rf]
RfcPipeline = Pipeline(stages=RfcStages)

#### Train the model using the split training data
#### Use the Test Data for Predictions

In [None]:
# Fit every pipeline stage (indexers, encoders, assembler, random forest) on the training split.
RfcModel = RfcPipeline.fit(trainingData)
# Apply the fitted pipeline to the held-out split to obtain its predictions.
RfcPredictions = RfcModel.transform(testData)

#### Create an Evaluator for our model
#### Check the Accuracy and Test Error of the Model

In [None]:
# Accuracy evaluator comparing the 'prediction' column with the indexed label.
RfcEvaluator = MulticlassClassificationEvaluator(
    labelCol='country_destination_index',
    predictionCol='prediction',
    metricName='accuracy',
)
RfcAccuracy = RfcEvaluator.evaluate(RfcPredictions)
RfcTestError = 1.0 - RfcAccuracy  # share of misclassified test rows

print("Accuracy = %s" % RfcAccuracy)
print("Test Error = %s" % RfcTestError)

#### Create the Confusion Matrix of the Predictions

In [None]:
# Build (prediction, label) pairs for the RDD-based MulticlassMetrics API.
# The earlier global orderBy('prediction') and the second, identical select
# were removed: a confusion matrix only counts pairs, so row order is
# irrelevant and the sort was a wasted shuffle.
preds_and_labels = RfcPredictions.select(
    F.col('prediction'),
    F.col('country_destination_index').cast(FloatType()).alias('country_destination_index'),
)
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
print(metrics.confusionMatrix().toArray())

#### Specify Second Estimator Classifier (Decision Tree Classifier)
#### Create Pipeline to be used to build the Model contains all the Transformers and ends with the Estimator

In [None]:
# Decision tree estimator reading the assembled vector and the indexed label.
dt = DecisionTreeClassifier(featuresCol='features', labelCol='country_destination_index')

# Stage order: feature indexers, one-hot encoders, label indexer, assembler, classifier.
DtcStages = [*indexers, *encoders, *labelIndexer, assembler, dt]
DtcPipeline = Pipeline(stages=DtcStages)

#### Train the model using the split training data
#### Use the Test Data for Predictions

In [None]:
# Fit every pipeline stage (indexers, encoders, assembler, decision tree) on the training split.
DtcModel = DtcPipeline.fit(trainingData)
# Apply the fitted pipeline to the held-out split to obtain its predictions.
DtcPredictions = DtcModel.transform(testData)

#### Create an Evaluator for our model
#### Check the Accuracy and Test Error of the Model

In [None]:
# Accuracy evaluator comparing the 'prediction' column with the indexed label.
DtcEvaluator = MulticlassClassificationEvaluator(
    labelCol='country_destination_index',
    predictionCol='prediction',
    metricName='accuracy',
)
DtcAccuracy = DtcEvaluator.evaluate(DtcPredictions)
DtcTestError = 1.0 - DtcAccuracy  # share of misclassified test rows

print("Accuracy = %s" % DtcAccuracy)
print("Test Error = %s" % DtcTestError)

#### Create the Confusion Matrix of the Predictions

In [None]:
# Build (prediction, label) pairs for the RDD-based MulticlassMetrics API.
# The earlier global orderBy('prediction') and the second, identical select
# were removed: a confusion matrix only counts pairs, so row order is
# irrelevant and the sort was a wasted shuffle.
preds_and_labels = DtcPredictions.select(
    F.col('prediction'),
    F.col('country_destination_index').cast(FloatType()).alias('country_destination_index'),
)
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
print(metrics.confusionMatrix().toArray())

#### Conclusion
##### Random Forest Classifier Accuracy is  67.5 %
##### Decision Tree Classifier Accuracy is  67.65 %
##### Decision Tree Classifier accuracy is slightly higher than Random Forest Classifier accuracy


###### Note: the labels are imbalanced (and unweighted); this problem should be addressed in future work