# DRILL PART 3 | DATA MINING: BUILDING A PREDICTION MODEL TO DETERMINE THE DESTINATION COUNTRY SELECTED BY THE USER

This GitHub page is part of a three-part project named DRILL, whose main focus is to explore and clean the Airbnb dataset and, finally, to build a prediction model that predicts the destination country of users. <br>

>Drill part 1 can be found in [airbnb_explore.ipynb](https://github.com/singh-sona/Messy-Data-Cleanning-Projects/blob/master/airbnb_explore.ipynb)

>Drill part 2 can be found in [airbnb_clean.ipynb](https://github.com/singh-sona/Messy-Data-Cleanning-Projects/blob/master/airbnb_clean.ipynb)

## DATA TRANSFORMATION: 

In [None]:
%run airbnb_clean.ipynb

In [None]:
# Home made One Hot Encoding function
def convert_to_binary(df, column_to_convert):
    """One-hot encode ``column_to_convert`` by adding 0/1 columns to ``df``.

    One indicator column is added per distinct value of the source column.
    Each new column is named from the first five characters of
    ``column_to_convert``, an underscore, and the first ten characters of the
    cleaned category name (spaces and slashes become underscores; parentheses
    and hyphens are removed; everything is lower-cased).

    Args:
        df: DataFrame to extend. It is modified in place.
        column_to_convert: Name of the categorical column to encode. The
            source column itself is left untouched; callers drop it.

    Returns:
        The same ``df`` object, with the indicator columns added.

    Note:
        Because names are truncated, two different columns/categories can map
        to the same indicator name; in that case the later one resets and
        overwrites the earlier one.
    """
    # One pass over the string instead of five chained ``replace`` calls.
    cleanup = str.maketrans({" ": "_", "/": "_", "(": None, ")": None, "-": None})
    values = df[column_to_convert]

    for category in list(values.drop_duplicates()):
        cat_name = str(category).translate(cleanup).lower()
        col_name = column_to_convert[:5] + '_' + cat_name[:10]

        # A missing value never compares equal to itself, so ``== category``
        # would leave the indicator for missing values all zeros.
        if pd.isna(category):
            matches = values.isna()
        else:
            matches = values == category

        df[col_name] = 0
        df.loc[matches, col_name] = 1

    return df

# One Hot Encoding
print("One Hot Encoding categorical data...")
columns_to_convert = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
]

for column in columns_to_convert:
    # Add the indicator columns, then remove the original categorical
    # column from the same frame.
    df_com = convert_to_binary(df_com, column)
    del df_com[column]

In [None]:
# Add new data related fields
print("Adding new fields...")

# (column prefix, datetime accessor attribute) pairs shared by both dates.
# 'day' is the day of the week, not the day of the month.
date_parts = [('day', 'weekday'), ('month', 'month'), ('quarter', 'quarter'), ('year', 'year')]

for prefix, attr in date_parts:
    df_com[prefix + '_account_created'] = getattr(df_com['date_account_created'].dt, attr)

# Only the first-active timestamp has an hour-of-day feature.
df_com['hour_first_active'] = df_com['timestamp_first_active'].dt.hour
for prefix, attr in date_parts:
    df_com[prefix + '_first_active'] = getattr(df_com['timestamp_first_active'].dt, attr)

# Whole days between account creation and first activity.
created_minus_active = df_com['date_account_created'] - df_com['timestamp_first_active']
df_com['created_less_active'] = created_minus_active.dt.days

# Drop unnecessary columns (any that are already absent are skipped)
columns_to_drop = ['date_account_created', 'timestamp_first_active', 'date_first_booking', 'country_destination']
df_com.drop(columns=columns_to_drop, errors='ignore', inplace=True)


Although the transformation took two steps and changed the training dataset from 14 columns to 163 columns, the data is merely expanded by One Hot Encoding, which does not add more information but simply expands out the existing information. No external data has been added.

### Reading and working on the remaining file, sessions.csv

In [None]:
# Load the session log; the first row is the header and no column is used
# as the index.
sessions = pd.read_csv(
    "sessions.csv",
    header=0,
    index_col=False,
)

In [None]:
# Determine primary device
print("Determing primary device...")
sessions_device = sessions[['user_id', 'device_type', 'secs_elapsed']]

# Total seconds per (user, device) pair.
aggregated_lvl1 = sessions_device.groupby(['user_id', 'device_type'], as_index=False, sort=False).sum()

# Flag the rows whose total equals that user's largest total.
idx = aggregated_lvl1['secs_elapsed'] == aggregated_lvl1.groupby('user_id', sort=False)['secs_elapsed'].transform('max')
df_primary = aggregated_lvl1.loc[idx, ['user_id', 'device_type', 'secs_elapsed']].rename(
    columns={'device_type': 'primary_device', 'secs_elapsed': 'primary_secs'}
)
df_primary = convert_to_binary(df=df_primary, column_to_convert='primary_device')
df_primary = df_primary.drop(columns='primary_device')

# Determine Secondary device
print("Determing secondary device...")
# Everything that was not picked as a primary device; the same "largest
# total per user" rule is then applied to what is left.
remaining = aggregated_lvl1[~idx]
idx = remaining['secs_elapsed'] == remaining.groupby('user_id', sort=False)['secs_elapsed'].transform('max')
df_secondary = remaining.loc[idx, ['user_id', 'device_type', 'secs_elapsed']].rename(
    columns={'device_type': 'secondary_device', 'secs_elapsed': 'secondary_secs'}
)
df_secondary = convert_to_binary(df=df_secondary, column_to_convert='secondary_device')
df_secondary = df_secondary.drop(columns='secondary_device')


In [None]:
# Count occurrences of value in a column
def convert_to_counts(df, id_col, column_to_convert):
    """Count how often each value of ``column_to_convert`` occurs per id.

    Args:
        df: Source DataFrame containing ``id_col`` and ``column_to_convert``.
            It is not modified.
        id_col: Name of the column identifying the entity to count for.
        column_to_convert: Name of the categorical column whose values are
            counted.

    Returns:
        A new DataFrame indexed by ``id_col`` with one column per distinct
        category. Each cell holds the number of rows with that id/category
        pair, or 0 when the pair never occurs. Columns are named
        ``<column_to_convert>_<cleaned category>``: spaces and slashes become
        underscores, parentheses and hyphens are removed, and the name is
        lower-cased.
    """
    df_counts = df.loc[:, [id_col, column_to_convert]].assign(count=1)
    df_counts = df_counts.groupby(by=[id_col, column_to_convert], as_index=False, sort=False).sum()

    # Wide layout: one row per id, one column per category.
    new_df = df_counts.pivot(index=id_col, columns=column_to_convert, values='count')
    new_df = new_df.fillna(0)

    # Rename every category column. The rename must cover all categories
    # before returning, not just the first one.
    cleanup = str.maketrans({" ": "_", "/": "_", "(": None, ")": None, "-": None})
    renamed = {
        category: column_to_convert + '_' + str(category).translate(cleanup).lower()
        for category in df[column_to_convert].drop_duplicates()
    }
    return new_df.rename(columns=renamed)
    
    # Aggregate and combine actions taken columns
print("Aggregating actions taken...")
session_actions = sessions.loc[:, ['user_id', 'action', 'action_type', 'action_detail']]
columns_to_convert = ['action', 'action_type', 'action_detail']
# NOTE: fillna applies to every selected column, user_id included.
session_actions = session_actions.fillna('not provided')

# Build one per-user count table for each action column, then join them all
# side by side. The merge has to take in every column's table; merging only
# after the loop would keep just the last column's counts.
count_frames = []
for column in columns_to_convert:
    print("Converting " + column + " column...")
    count_frames.append(
        convert_to_counts(df=session_actions, id_col='user_id', column_to_convert=column)
    )

# Inner join keeps only the users present in every count table.
actions_data = pd.concat(count_frames, axis=1, join='inner')

> PREVIOUS | Drill part 2: [airbnb_clean.ipynb](https://github.com/singh-sona/Messy-Data-Cleanning-Projects/blob/master/airbnb_clean.ipynb)

### BUILDING PREDICTION MODEL:

In [None]:
# Prepare training data for modelling
df_train.set_index('id', inplace=True)

# Put the target column next to the engineered features; the inner join
# keeps only index labels present in both frames.
df_train = pd.concat([df_train['country_destination'], df_com], join='inner', axis=1)

labels = df_train['country_destination']
id_train = df_train.index.values

# Encode the destination labels as integers for the model.
le = LabelEncoder()
y = le.fit_transform(labels)

# Feature matrix: everything except the target column.
X = df_train.drop(columns='country_destination')

In [None]:
# More to come...