In [116]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import os

In [117]:
os.chdir('/Users/Sean/Desktop/DS1003_Final_Project/data')

In [107]:
review = pd.read_csv('reviews_all.csv',encoding = "ISO-8859-1")
listings = pd.read_csv('listings_all.csv',encoding = "ISO-8859-1")

### drop na column where missing greater than 50%

In [108]:
listings_na_cols = listings.columns[pd.isnull(listings).sum()/len(listings) > 0.5].tolist()
review_na_cols = review.columns[pd.isnull(review).sum()/len(review) > 0.5].tolist()

In [109]:
listings_na_cols

['notes',
 'host_acceptance_rate',
 'square_feet',
 'weekly_price',
 'monthly_price',
 'security_deposit',
 'has_availability',
 'license',
 'jurisdiction_names']

In [110]:
listings.drop(listings_na_cols,inplace=True,axis=1)

In [111]:
listings.shape

(40227, 86)

In [8]:
# drop all the 
# listings = listings.dropna()

In [113]:
listings.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url',
       'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url',
       'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_is_superhost',
       'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood',
       'host_listings_count', 'host_total_listings_count',
       'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
       'street', 'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenit

In [125]:
listings['price'] = listings['price'].apply(lambda x: 0 if x <= 50 else 1 if 50<x<=100 else 2)

In [119]:
cat_cols = ['host_response_time','host_is_superhost','host_has_profile_pic','host_identity_verified','neighbourhood_group_cleansed',
            'zipcode','property_type','room_type','bed_type','instant_bookable']
text_cols = ['summary','name','space','description','neighborhood_overview','transit','access','interaction', 'house_rules',
            'host_about','host_verifications', 'neighbourhood_cleansed','amenities']
num_cols = ['host_response_rate','host_listings_count','host_total_listings_count','accommodates','bathrooms','bedrooms',
           'beds','guests_included', 'minimum_nights','number_of_reviews', 'review_scores_rating', 'reviews_per_month']
Y = ['price']

In [126]:
clean_data = listings[text_cols+cat_cols+num_cols+Y]

In [127]:
clean_data.shape

(40227, 36)

In [128]:
# clean_data_na_cols = listings.columns[pd.isnull(listings).sum()/len(listings) > 0].tolist()
missing_list = []
for i in range(len(clean_data.columns)):
    missing_perc = float(pd.isnull(clean_data[[i]]).sum()/len(clean_data[[i]]))
    missing = [missing_perc, clean_data.columns[i]]
    missing_list.append(missing)

In [129]:
sorted(missing_list)

[[0.0, 'accommodates'],
 [0.0, 'amenities'],
 [0.0, 'bed_type'],
 [0.0, 'guests_included'],
 [0.0, 'host_verifications'],
 [0.0, 'instant_bookable'],
 [0.0, 'minimum_nights'],
 [0.0, 'name'],
 [0.0, 'neighbourhood_cleansed'],
 [0.0, 'neighbourhood_group_cleansed'],
 [0.0, 'number_of_reviews'],
 [0.0, 'price'],
 [0.0, 'property_type'],
 [0.0, 'room_type'],
 [0.0002485892559723569, 'description'],
 [0.0007706266935143063, 'host_has_profile_pic'],
 [0.0007706266935143063, 'host_identity_verified'],
 [0.0007706266935143063, 'host_is_superhost'],
 [0.0007706266935143063, 'host_listings_count'],
 [0.0007706266935143063, 'host_total_listings_count'],
 [0.0014169587590424342, 'bedrooms'],
 [0.0017152658662092624, 'beds'],
 [0.004623760161085838, 'bathrooms'],
 [0.014517612548785641, 'zipcode'],
 [0.03972456310438263, 'summary'],
 [0.22564446764610835, 'reviews_per_month'],
 [0.2437914833320904, 'review_scores_rating'],
 [0.2604469634822383, 'host_response_rate'],
 [0.2604469634822383, 'host_re

In [124]:
# build my feature
def extract_features(feature_df,cat_cols,text_cols,num_cols):
    
    # Encode text features
    for text_col in text_cols:
        tfidf_vec = TfidfVectorizer(stop_words="english", max_df=80, min_df=5, ngram_range=[1,1])
        lda = LatentDirichletAllocation()
        tfidf_tokens = tfidf_vec.fit_transform(feature_df[text_col])
        lda_res = lda.fit_transform(tfidf_tokens)
        topics = text_col + 'topic'
        feature_df[topics] = np.argmax(lda_res, axis=1)
        feature_df = feature_df.drop([text_col], 1)
    
    # Encode categorical features
    for cat_col in cat_cols:
        all_unique_val = np.unique(feature_df[cat_col])
        for val in all_unique_val:
            feature_df["{0}={1}".format(cat_col,val)] = feature_df.apply(lambda x: x[cat_col]==val,1)
        feature_df = feature_df.drop(cat_col,1)

    # Encode Label
    label_encoder = LabelEncoder()
    label_encoder.fit(feature_df['price'])
    feature_df['Y'] = label_encoder.transform(feature_df['price'])
    encoded_df = feature_df.drop(['price'],1)
        
    return encoded_df

In [16]:
encoded_data = extract_features(clean_data,cat_cols,text_cols,num_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [36]:
encoded_data.shape

(5458, 228)

In [17]:
# divide the dataset to train dataset and test data set

def prepare_train_test_set(dataset, label="Y", test_ratio=0.3):

    # parition the dataset randomly 
    drop_cols = [label]
    x = dataset.drop(drop_cols, 1).as_matrix()        
    y = dataset[label].as_matrix()
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_ratio)
    
    return [x_train, x_test, y_train, y_test]

In [18]:
def quick_test_model(x_train, x_test, y_train, y_test, model, eval_metrics):

    all_labels = np.unique(np.concatenate([y_train,y_test]))
    model.fit(x_train, y_train)
    pred_train = model.predict(x_train)
    pred_test = model.predict(x_test)
    train_loss = eval_metrics(y_train,pred_train)
    test_loss = eval_metrics(y_test,pred_test)
    
    return [train_loss, test_loss]

In [19]:
x_train, x_test, y_train, y_test = prepare_train_test_set(encoded_data)

In [20]:
# using random forest model
rf = RandomForestClassifier() 

# grid search for the best fit parameters 
param_grid = {
    'min_samples_split': [10,50],
    'n_estimators': [50, 100],
    
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth':[10,50],
    'min_samples_leaf':[5,100]    
}
CV_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv= 5)
CV_rf.fit(x_train, y_train)
print (CV_rf.best_params_)

{'max_depth': 50, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 50}


In [21]:
# using the best fit parameters to train the model
rf_tuned_para = RandomForestClassifier(max_features ='auto', min_samples_leaf =5, n_estimators =50, min_samples_split =10, 
                                max_depth =50)
# Run Model
rf_tuned_para.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [22]:
print(quick_test_model(x_train, x_test, y_train, y_test, rf_tuned_para, accuracy_score))

[0.81753926701570678, 0.7649572649572649]


In [None]:
# three class [0.81753926701570678, 0.7649572649572649]