In [0]:
import os
import pickle
import warnings
import zipfile

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

#ignore warnings
warnings.filterwarnings('ignore')

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8
%matplotlib inline

# Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb

In [0]:
# Columns removed up front; they are never used as features below.
_IDENTIFIER_COLUMNS = ["Unnamed: 0", "GUID"]

# Columns with more than this percentage of missing values are discarded.
_MAX_MISSING_PCT = 85

# Upper (inclusive) bin edges per column: a value v gets code k when
# edges[k-1] < v <= edges[k], and code len(edges) when v is above every edge.
# NOTE(review): the thresholds are hard-coded; presumably they were derived
# from the training data -- confirm they match the training preprocessing.
_RATING_EDGES = [67282, 69475, 71661]
_NUMBER_OF_REVIEWS_EDGES = [4]
_HOST_RESPONSE_RATE_EDGES = [25, 50, 75]
_EXTRA_PEOPLE_EDGES = [199.8, 399.6, 599.4, 799.2]
_HOST_TOTAL_LISTINGS_EDGES = [557]
_AVAILABILITY_30_EDGES = [10, 20]
_AVAILABILITY_365_EDGES = [73, 146, 219, 292]
_MINIMUM_NIGHTS_EDGES = [5000.5]
_MAXIMUM_NIGHTS_EDGES = [715827883.0, 1431655765.0]
_BEDS_EDGES = [6.333, 12.667]

# Cancellation-policy variants collapsed onto the three main categories.
# NOTE(review): an earlier comment in this notebook also listed "long_term"
# among the policies to merge into "strict", but the code never did; it is
# left untouched here -- confirm the intended handling.
_POLICY_ALIASES = {
    "strict_new": "strict",
    "super_strict_60": "strict",
    "super_strict_30": "strict",
    "super_strict_30_new": "strict",
    "super_strict_60_new": "strict",
    "no_refunds": "strict",
    "moderate_new": "moderate",
    "flexible_new": "flexible",
}

_ONE_HOT_COLUMNS = ["Room_Type", "Cancellation_Policy", "Experiences_Offered", "Bed_Type"]

# Non-amenity model inputs, in the order they are handed to the scaler/model.
_BASE_FEATURES = [
    'Rating', 'Host_Response_Rate', 'Bathrooms', 'Bedrooms', 'Beds',
    'Maximum_Nights', 'Minimum_Nights', 'Availability_30', 'Availability_365',
    'Extra_People', 'Room_Type_Entire home/apt', 'Room_Type_Private room',
    'Room_Type_Shared room', 'Number_of_Reviews', 'Cancellation_Policy_flexible',
    'Cancellation_Policy_moderate', 'Cancellation_Policy_strict',
    'Experiences_Offered_business', 'Experiences_Offered_family',
    'Experiences_Offered_none', 'Experiences_Offered_romantic',
    'Experiences_Offered_social', 'Bed_Type_Airbed', 'Bed_Type_Couch',
    'Bed_Type_Futon', 'Bed_Type_Pull-out Sofa', 'Bed_Type_Real Bed',
]

_PREDICTION_COLUMN = "Predicted_Listing_Type"


def _bin_right_closed(values, upper_edges):
    """Replace each value by the index of the right-closed bin it falls into.

    Missing values stay missing (the result is then float-typed).
    """
    return pd.cut(values, bins=[-np.inf, *upper_edges, np.inf], labels=False)


def _normalise_amenities(raw):
    """Turn a comma-separated amenity string into a '|'-separated one without braces/quotes."""
    return "|".join(
        token.replace("}", "").replace("{", "").replace('"', "")
        for token in raw.split(",")
    )


def predictListingType(test_df, model=None, scaler=None, model_path=".pickle.dat"):
    """Preprocess a listings frame, run the classifier and attach its predictions.

    The input frame is not modified; every step works on a derived frame.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Raw listings. Must contain the columns read below (``Rating``,
        ``Number_of_Reviews``, ``Host_Response_Rate``, ``Extra_People``,
        ``Host_Total_Listings_Count``, ``Availability_30``,
        ``Availability_365``, ``Minimum_Nights``, ``Maximum_Nights``,
        ``Bathrooms``, ``Beds``, ``Bedrooms``, ``Room_Type``,
        ``Cancellation_Policy``, ``Experiences_Offered``, ``Bed_Type`` and
        ``Amenities``).
    model : object with ``predict``, optional
        Fitted classifier. When omitted it is unpickled from ``model_path``.
    scaler : object with ``transform``, optional
        Scaler fitted on the training features. When omitted, a new
        ``StandardScaler`` is fitted on the test features themselves
        (see the note in the body).
    model_path : str, optional
        Pickle file used when ``model`` is omitted.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame (binned and one-hot encoded, index reset) with
        the model output in a ``Predicted_Listing_Type`` column. Rows lacking
        ``Number_of_Reviews``, ``Host_Response_Rate`` (after interpolation)
        or ``Amenities`` are dropped, so it can be shorter than the input.

    Raises
    ------
    KeyError
        If a required column is absent (including when it was discarded for
        exceeding the missing-value limit).
    """
    test_df = test_df.drop(columns=_IDENTIFIER_COLUMNS, errors="ignore")

    # Discard columns that are almost entirely empty.
    missing_pct = test_df.isnull().sum() / len(test_df) * 100
    test_df = test_df.drop(columns=missing_pct[missing_pct > _MAX_MISSING_PCT].index)

    ## Rating: median-impute, truncate to int, bin.
    rating = test_df["Rating"].fillna(test_df["Rating"].median()).astype(int)
    test_df["Rating"] = _bin_right_closed(rating, _RATING_EDGES)

    ## Number of reviews: rows without a value are dropped.
    # .copy() makes the frame independent so the assignments below never hit a view.
    test_df = test_df.dropna(subset=["Number_of_Reviews"]).copy()
    test_df["Number_of_Reviews"] = _bin_right_closed(
        test_df["Number_of_Reviews"].astype(int), _NUMBER_OF_REVIEWS_EDGES)

    ## Host response rate: linear interpolation, then drop what is still missing
    ## (forward interpolation cannot fill leading gaps).
    test_df["Host_Response_Rate"] = test_df["Host_Response_Rate"].interpolate(method="linear")
    test_df = test_df.dropna(subset=["Host_Response_Rate"]).copy()
    test_df["Host_Response_Rate"] = _bin_right_closed(
        test_df["Host_Response_Rate"].astype(int), _HOST_RESPONSE_RATE_EDGES)

    ## Extra_People
    test_df["Extra_People"] = _bin_right_closed(
        test_df["Extra_People"].fillna(0).astype(int), _EXTRA_PEOPLE_EDGES)

    ## Host listing counts (the redundant count columns are removed).
    test_df = test_df.drop(
        columns=["Calculated_host_listings_count", "Guests_Included"], errors="ignore")
    host_listings = test_df["Host_Total_Listings_Count"]
    test_df["Host_Total_Listings_Count"] = _bin_right_closed(
        host_listings.fillna(host_listings.median()).astype(int), _HOST_TOTAL_LISTINGS_EDGES)

    ## Availability: only the 30- and 365-day windows are kept.
    test_df = test_df.drop(columns=["Availability_60", "Availability_90"], errors="ignore")
    test_df["Availability_30"] = _bin_right_closed(
        test_df["Availability_30"], _AVAILABILITY_30_EDGES)
    test_df["Availability_365"] = _bin_right_closed(
        test_df["Availability_365"], _AVAILABILITY_365_EDGES)

    ## Minimum / maximum nights (unparseable maximum values become missing).
    test_df["Minimum_Nights"] = _bin_right_closed(
        test_df["Minimum_Nights"], _MINIMUM_NIGHTS_EDGES)
    test_df["Maximum_Nights"] = _bin_right_closed(
        pd.to_numeric(test_df["Maximum_Nights"], errors="coerce"), _MAXIMUM_NIGHTS_EDGES)

    ## Bathrooms, beds, bedrooms: missing values default to 1.
    test_df["Bathrooms"] = test_df["Bathrooms"].fillna(1).astype(int)
    test_df["Bedrooms"] = test_df["Bedrooms"].fillna(1)
    test_df["Beds"] = _bin_right_closed(test_df["Beds"].fillna(1).astype(int), _BEDS_EDGES)

    ## Categorical columns: merge policy variants, then one-hot encode.
    # "flexible_new" now maps to "flexible"; it used to be sent to "moderate",
    # contradicting the stated intent.
    # NOTE(review): confirm the training preprocessing did the same.
    test_df["Cancellation_Policy"] = test_df["Cancellation_Policy"].replace(_POLICY_ALIASES)
    test_df = pd.get_dummies(test_df, columns=_ONE_HOT_COLUMNS)

    ## Amenities: one boolean column per amenity name seen in the data.
    test_df = test_df.dropna(subset=["Amenities"]).reset_index(drop=True)
    test_df["Amenities"] = test_df["Amenities"].map(_normalise_amenities)
    # The empty token (from empty amenity lists) is removed by value; slicing
    # off the first sorted entry would discard a real amenity when no empty
    # token is present.
    amenity_names = sorted(
        {token for joined in test_df["Amenities"] for token in joined.split("|")} - {""})
    # NOTE(review): this is a substring match on the joined string (e.g. "TV"
    # also matches "Cable TV"). Kept as-is so the features stay comparable to
    # what the model was presumably trained on -- confirm.
    amenity_flags = pd.DataFrame(
        {name: test_df["Amenities"].str.contains(name, regex=False) for name in amenity_names},
        index=test_df.index,
    )

    # reindex supplies an all-zero column for any category that does not occur
    # in this data set instead of failing with KeyError.
    # NOTE(review): the amenity columns depend on the amenities present in this
    # data; they must line up with the columns the model was trained on.
    features = pd.concat(
        [test_df.reindex(columns=_BASE_FEATURES, fill_value=0), amenity_flags], axis=1)

    if scaler is None:
        # An unfitted scaler cannot transform. Fitting on the test features is
        # only a fallback; pass the scaler fitted on the training data to
        # reproduce the training-time scaling.
        scaler = StandardScaler()
        X_test = scaler.fit_transform(features)
    else:
        X_test = scaler.transform(features)

    if model is None:
        # SECURITY: unpickling executes arbitrary code; only load trusted files.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)

    # Rows of `features` and `test_df` are aligned (same reset index).
    test_df[_PREDICTION_COLUMN] = np.asarray(model.predict(X_test))
    return test_df


In [0]:
if __name__ == "__main__":
    # Script entry point: read the zipped test listings, predict, print and save.
    # NOTE(review): the archive path is an absolute Colab / Google Drive
    # location; it only resolves when that drive is mounted.
    archive_path = '/content/gdrive/My Drive/PeopleInteractive/listings_test_send.zip'

    # Context managers close both the archive and the member handle, which
    # were previously left open.
    with zipfile.ZipFile(archive_path) as archive:
        with archive.open("listings_test_send.csv") as csv_file:
            test_df = pd.read_csv(csv_file, sep=";")

    output = predictListingType(test_df)
    print(output)
    output.to_csv("output.csv", sep=",")