In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,r2_score
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
%matplotlib inline
pd.options.display.max_columns = None
pd.set_option('display.max_colwidth', None)
pd.set_option('float_format', '{:f}'.format)

In [None]:
# feature_cols = user_merged.columns.drop(['business_id', 'name', 'address', 'city', 'state_left', 'postal_code',
# #                                     'census_tract', 'median_income', 'review_id', 'review_count_y', 'user__average_stars',
#                                         'user_id', 'review_stars', 'text', 'date', 'review_count_y', 'user__average_stars',
#                                         'rating_score','income_range'])

In [None]:
yelp_compiled = pd.read_csv('Yelp_data/yelp_compiled.csv')



In [None]:
#getting median income range
q3 =yelp_compiled['median_income'].quantile(.75)
q2 =yelp_compiled['median_income'].quantile(.50)
q1 =yelp_compiled['median_income'].quantile(.25)
def income_classifier(x):
    """Bucket a median-income value against the quartile cut-offs.

    Reads the module-level ``q1`` (25th percentile) and ``q3`` (75th
    percentile) of ``median_income``.

    Returns:
        "2" when ``x >= q3``, "0" when ``x <= q1``, otherwise "1".
        A NaN input fails both comparisons and therefore also yields "1".
    """
    if x >= q3:
        return "2"
    return "0" if x <= q1 else "1"

yelp_compiled['income_range'] = yelp_compiled['median_income'].apply(income_classifier)
#yelp_compiled.head()

In [None]:
#rating score
def rating_scorer(x):
    """Map a star rating onto a three-level score.

    Returns:
        1 for ratings above 3, 3 for ratings below 3, and 2 otherwise
        (exactly 3, or NaN, since NaN fails both comparisons).
    """
    if x > 3:
        return 1
    return 3 if x < 3 else 2
yelp_compiled['rating_score'] = yelp_compiled['stars'].apply(rating_scorer)

In [None]:
#getting weighted_star
yelp_compiled['weighted_star'] = yelp_compiled['review_count']*yelp_compiled['stars']

In [None]:
list(yelp_compiled.columns)

In [None]:
# Distribution of the median_income column.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; it is
# kept here so the rendered figure stays exactly the same.
plt.figure(figsize=(10, 7))
dist_ax = sns.distplot(yelp_compiled.median_income)
dist_ax.set_title("Median Income Distribution", fontsize=15)
dist_ax.set_xlabel('Median Income', fontsize=15);

In [None]:
yelp_compiled.median_income.describe()

In [None]:
# Number of rows in each income_range bucket.
plt.figure(figsize=(10, 7))
range_ax = sns.countplot(x='income_range', data=yelp_compiled)
range_ax.set_title("Median Income Ranges", fontsize=15)
range_ax.set_xlabel("Income Range", fontsize=15)
range_ax.set_ylabel("Count", fontsize=15);


In [None]:
#Distribution of stars
# One count panel per income_range value; each panel counts the rows at every
# star rating.
g = sns.catplot(x="stars", col="income_range",
                data=yelp_compiled, kind="count",
                height= 5, aspect=.9);

In [None]:
# Count how often each category label appears across all rows.
# `new_categories` is a comma-separated string. Labels are stripped *before*
# counting: tallying the raw pieces keeps "X" and " X" as two different
# categories, which later turn into duplicate rows once whitespace is removed.
# The pandas pipeline also replaces the bare `except:` counter (which hid any
# error, not only the expected KeyError) and skips rows with no categories.
top_categories = (
    yelp_compiled['new_categories']
    .dropna()
    .str.split(",")
    .explode()
    .str.strip()
    .value_counts()
    .rename_axis('Category')
    .reset_index(name='Count')
)

In [None]:
#stripping white space in front of words in column and getting dummy
top_categories['Category'] = top_categories['Category'].str.lstrip()
#yelp_compiled["new_categories"] = yelp_compiled['new_categories'].str.lstrip()

In [None]:
# Plot the most frequent categories.
# After stripping whitespace, rows that share a label are merged by summing
# their counts; left as duplicates, seaborn's barplot would average them
# (its default estimator is the mean) and under-report those categories.
top_n = 20
top_categories = (
    top_categories
    .assign(Category=top_categories['Category'].str.strip())
    .groupby('Category', as_index=False)['Count'].sum()
)
categories_sorted = top_categories.sort_values(by='Count', ascending=False).head(top_n)

plt.figure(figsize=(12, 8))
sns.barplot(x='Category', y='Count', data=categories_sorted)
# The title is derived from top_n so it matches the number of bars drawn
# (it used to say "Ten" while 20 categories were plotted).
plt.title("Top {} Restaurant Categories ".format(top_n), fontsize=15)
plt.xlabel("")
plt.ylabel("Count", fontsize=15)
plt.xticks(
    rotation=90,
    horizontalalignment='center',
    fontweight='light',
    fontsize='x-large')
plt.tight_layout();

In [None]:
# income_high.to_csv('Yelp_data/income_high', index=False)

In [None]:
# Rows in the top income bucket ("2") and how often each category label
# appears among them.
income_high = yelp_compiled[yelp_compiled['income_range'] == '2']
# Labels are stripped before counting so "X" and " X" are one category; the
# pandas pipeline replaces the bare `except:` counter and skips rows that
# have no categories.
categories_high = (
    income_high['new_categories']
    .dropna()
    .str.split(",")
    .explode()
    .str.strip()
    .value_counts()
    .rename_axis('Category')
    .reset_index(name='Count')
)


In [None]:
# Plot the most frequent categories in the high-income bucket.
# Labels are stripped and rows sharing a label are summed; duplicate labels
# would otherwise be averaged by seaborn's barplot.
top_n = 20
categories_high = (
    categories_high
    .assign(Category=categories_high['Category'].str.strip())
    .groupby('Category', as_index=False)['Count'].sum()
)
categories_sorted_h = categories_high.sort_values(by='Count', ascending=False).head(top_n)

plt.figure(figsize=(12, 8))
sns.barplot(x='Category', y='Count', data=categories_sorted_h)
# Title follows top_n (it used to say "Ten" while 20 bars were drawn).
plt.title("Top {} Restaurant Categories in High Income Group ".format(top_n), fontsize=15)
plt.xlabel("")
plt.ylabel("Count", fontsize=15)
plt.xticks(
    rotation=90,
    horizontalalignment='center',
    fontweight='light',
    fontsize='x-large')
plt.tight_layout();

In [None]:
# Rows in the bottom income bucket ("0") and how often each category label
# appears among them.
income_low = yelp_compiled[yelp_compiled['income_range'] == '0']
# Labels are stripped before counting so "X" and " X" are one category; the
# pandas pipeline replaces the bare `except:` counter and skips rows that
# have no categories.
categories_low = (
    income_low['new_categories']
    .dropna()
    .str.split(",")
    .explode()
    .str.strip()
    .value_counts()
    .rename_axis('Category')
    .reset_index(name='Count')
)

In [None]:
# Plot the most frequent categories in the low-income bucket.
# Labels are stripped and rows sharing a label are summed; duplicate labels
# would otherwise be averaged by seaborn's barplot.
top_n = 20
categories_low = (
    categories_low
    .assign(Category=categories_low['Category'].str.strip())
    .groupby('Category', as_index=False)['Count'].sum()
)
categories_sorted_l = categories_low.sort_values(by='Count', ascending=False).head(top_n)

plt.figure(figsize=(12, 8))
sns.barplot(x='Category', y='Count', data=categories_sorted_l)
# Title follows top_n (it used to say "Ten" while 20 bars were drawn).
plt.title("Top {} Restaurant Categories in Low Income Group".format(top_n), fontsize=15)
plt.xlabel("")
plt.ylabel("Count", fontsize=15)
plt.xticks(
    rotation=90,
    horizontalalignment='center',
    fontweight='light',
    fontsize='x-large')
plt.tight_layout();

In [None]:
# Within every (income_range, name) group, sort that group's rows by `stars`.
# NOTE(review): `top_reviewed_cat` is not referenced again in the visible
# notebook; confirm whether this per-group apply is still needed.
top_reviewed_cat =yelp_compiled.groupby(['income_range', 'name']).apply(pd.DataFrame.sort_values, 'stars')

In [None]:
# The 20 low-income rows with the highest review_count (ties ordered by
# stars), plotted by star rating.
income_low_res = (
    income_low
    .sort_values(by=['review_count', 'stars'], ascending=False)
    .head(20)
)
tick_style = dict(
    rotation=90,
    horizontalalignment='center',
    fontweight='light',
    fontsize='x-large',
)
plt.figure(figsize=(12, 8))
low_ax = sns.barplot(x='name', y='stars', data=income_low_res)
low_ax.set_title("Highly Reviewed Restaurant in Low Income ", fontsize=15)
low_ax.set_xlabel("")
low_ax.set_ylabel("stars", fontsize=15)
plt.xticks(**tick_style)
plt.tight_layout();

In [None]:
# The 20 high-income rows with the highest review_count (ties ordered by
# stars), plotted by star rating.
income_high_res = (
    income_high
    .sort_values(by=['review_count', 'stars'], ascending=False)
    .head(20)
)
tick_style = dict(
    rotation=90,
    horizontalalignment='center',
    fontweight='light',
    fontsize='x-large',
)
plt.figure(figsize=(12, 8))
high_ax = sns.barplot(x='name', y='stars', data=income_high_res)
high_ax.set_title("Highly Reviewed Restaurant in High Income ", fontsize=15)
high_ax.set_xlabel("")
high_ax.set_ylabel("stars", fontsize=15)
plt.xticks(**tick_style)
plt.tight_layout();

In [None]:
#stripping white space in front of words in column and getting dummy
yelp_compiled["new_categories"] = yelp_compiled['new_categories'].str.lstrip()


In [None]:
yelp_compiled.shape

In [None]:
yelp_compiled.head()

In [None]:
# Tally, per business_id, how many rows carry a rating above 3 stars.
top_reviewed = yelp_compiled[yelp_compiled["stars"] > 3]
top_reviews_dict = {}

for business_id in top_reviewed["business_id"].values:
    # dict.get replaces the bare `except:` that swallowed every error, not
    # only the KeyError raised on a first occurrence.
    top_reviews_dict[business_id] = top_reviews_dict.get(business_id, 0) + 1

# Built from (id, count) pairs with explicit column names, so an empty tally
# yields an empty two-column frame instead of failing when the columns of a
# one-column frame are renamed.
topbusiness = pd.DataFrame(
    list(top_reviews_dict.items()),
    columns=['business_id', 'rated'],
)
# top_reviews_dict is kept because it is displayed in the next cell.
del top_reviewed

In [None]:
top_reviews_dict


In [None]:
topbusiness.head()

In [None]:
top_count = 20
# Lookup frame with id, display name and categories; built from the raw
# values so it carries a fresh default index and renamed headers.
right = pd.DataFrame(
    yelp_compiled[['business_id', "name", "new_categories"]].values,
    columns=['business_id', "Business name", "new_categories"],
)

# Attach name and categories to the per-business tally, then chart the
# top_count rows with the largest `rated` value.
top_business_data = topbusiness.merge(right, how="inner", on='business_id')
ranked = top_business_data.sort_values("rated").iloc[::-1].iloc[:top_count]
positive_ax = ranked.plot(
    x="Business name",
    y="rated",
    kind="bar",
    figsize=(14, 6),
    title='Positive reviews',
)
positive_ax.set_ylabel("Total ratings")

# These intermediates are removed here, so this cell needs the tally cell to
# be re-run before it can be executed again.
del topbusiness
del right

In [None]:
top_business_data

In [None]:

# top_reviewed_high = income_high[income_high["stars"]>3 ]
# num_cat =10 # to show top 10 catrgories
# top_business = 30 # choose categories of top 30 businesses
# cat_data = top_reviewed_high.sort_values("rated")[::-1]
# cat_data.new_categories
# Categories={}
# for cat in cat_data.new_categories.values:
#     all_categories= cat.split(",")
#     for x in all_categories:
#         try :
#             Categories[x] =Categories[x]+1
#         except:
#             Categories[x]=1
# top_categories_high = pd.DataFrame.from_dict(data= Categories,orient="index")
# top_categories_high.reset_index(inplace=True)
# top_categories_high.columns = ['Category', 'Count']


In [None]:
# #plotting
# top_categories_high['Category'] = top_categories_high['Category'].str.lstrip()

# top_categories_sorted_h =top_categories_high.sort_values(by ='Count', ascending =False).head(20)
# plt.figure(figsize=(12,8))
# sns.barplot(x='Category',y='Count',data= top_categories_sorted_h) 
# plt.title("Ten Highly Ranked Categories in High Income Group", fontsize=15)
# plt.xlabel("")
# plt.ylabel("Count",fontsize =15)
# plt.xticks(
#     rotation= 90, 
#     horizontalalignment='center',
#     fontweight='light',
#     fontsize='x-large')
# plt.tight_layout();

In [None]:
# Category counts for well-rated (> 3 stars) businesses in the low-income
# bucket.
top_reviewed_low = income_low[income_low["stars"] > 3]
num_cat = 10  # to show top 10 categories
top_business = 30  # choose categories of top 30 businesses
# NOTE(review): num_cat and top_business are not applied anywhere in the
# visible notebook; confirm whether a cut-off was intended.

# The tally used to run over *all* of top_business_data, so the chart titled
# "Low Income Group" was built from every income range and top_reviewed_low
# was never used. Restrict the ranked businesses to the low-income ids.
cat_data = (
    top_business_data[
        top_business_data['business_id'].isin(top_reviewed_low['business_id'])
    ]
    .sort_values("rated")[::-1]
)

# Labels are stripped before counting so "X" and " X" are one category; rows
# without categories are skipped rather than crashing the split.
top_categories_low = (
    cat_data['new_categories']
    .dropna()
    .str.split(",")
    .explode()
    .str.strip()
    .value_counts()
    .rename_axis('Category')
    .reset_index(name='Count')
)

In [None]:
# Plot the most frequent categories among the highly rated businesses.
# Labels are stripped and rows sharing a label are summed; duplicate labels
# would otherwise be averaged by seaborn's barplot.
top_n = 20
top_categories_low = (
    top_categories_low
    .assign(Category=top_categories_low['Category'].str.strip())
    .groupby('Category', as_index=False)['Count'].sum()
)
top_categories_sorted_l = top_categories_low.sort_values(by='Count', ascending=False).head(top_n)

plt.figure(figsize=(12, 8))
sns.barplot(x='Category', y='Count', data=top_categories_sorted_l)
# Title follows top_n (it used to say "Ten" while 20 bars were drawn).
plt.title("{} Highly Ranked Categories in Low Income Group".format(top_n), fontsize=15)
plt.xlabel("")
plt.ylabel("Count", fontsize=15)
plt.xticks(
    rotation=90,
    horizontalalignment='center',
    fontweight='light',
    fontsize='x-large')
plt.tight_layout();

In [None]:
#top_categories = top_categories.occurance.sort_values(ascending =False).head(20)

In [None]:
top_categories

In [None]:
# One indicator column per label found in the comma-separated
# `new_categories` strings.
yelp_compiled_cat = yelp_compiled['new_categories'].str.get_dummies(',')
# Splitting on "," leaves surrounding whitespace on the labels, so normalise
# the headers...
yelp_compiled_cat = yelp_compiled_cat.rename(columns=str.strip)
# ...and add together the columns that now share a name. The sum is taken on
# the transposed frame because DataFrame.groupby(..., axis=1) is deprecated
# in recent pandas; the result is the same.
# A cell can exceed 1 when a row listed the same label with different spacing.
yelp_compiled_cat = yelp_compiled_cat.T.groupby(level=0).sum().T

In [None]:
#adding up each category
yelp_categories =yelp_compiled_cat.sum(numeric_only =True)


In [None]:
# Combined "ethnic food" column: per row, the sum of the listed cuisine
# indicator columns.
# The expression used to be split over two lines without a continuation, so
# the second line (Latin American, Mediterranean, Middle Eastern, Vietnamese)
# was evaluated and thrown away instead of being added to the total.
ethnic_columns = [
    'Ethnic Food', 'Greek', 'Hawaiian', 'Korean',
    'Latin American', 'Mediterranean', 'Middle Eastern', 'Vietnamese',
]
yelp_compiled_cat['ethnic_food'] = yelp_compiled_cat[ethnic_columns].sum(axis=1)

In [None]:
# Keep only the category columns whose column total exceeds 100.
column_totals = yelp_compiled_cat.sum()
yelp_compiled_cat = yelp_compiled_cat.loc[:, column_totals > 100]

In [None]:
print(yelp_compiled_cat.shape)
print(list(yelp_compiled_cat.columns))

In [None]:
# Collapse these two columns to 0/1 flags.
# replace(2, 1) only handled the value 2, but ethnic_food is a sum of several
# cuisine columns and can reach 3 or more; clip caps every positive count at 1.
# assign() returns a new frame, which also avoids writing into a slice of the
# filtered frame.
yelp_compiled_cat = yelp_compiled_cat.assign(
    American=yelp_compiled_cat['American'].clip(upper=1),
    ethnic_food=yelp_compiled_cat['ethnic_food'].clip(upper=1),
)

In [None]:
# Remove the category columns that are not kept as features.
dropped_categories = ['Arts & Entertainment', 'Food', 'Event Planning & Services', 'Mediterranean']
yelp_compiled_cat = yelp_compiled_cat.drop(columns=dropped_categories)
# Headers become lower case, with every whitespace character replaced by "_".
yelp_compiled_cat = yelp_compiled_cat.rename(
    columns=lambda label: re.sub(r"\s", '_', label).lower()
)
# Index-aligned join of the category flags onto the original frame, then
# persist the combined table.
yelp_compiled_new = yelp_compiled.join(yelp_compiled_cat)
yelp_compiled_new.to_csv('Yelp_data/yelp_compiled_new.csv', index=False)

In [None]:
yelp_compiled_new =pd.read_csv('Yelp_data/yelp_compiled_new.csv')

In [None]:
plt.figure(figsize=(8,8))
sns.scatterplot(data = yelp_compiled_new, x ='stars', y ='median_income')
plt.xlabel('Stars');

In [None]:
# d = {'business_id':'business','price_range': 'avg_pric_range', 'romantic':'avg_romantic','intimate':'avg_intimate', 'touristy':'average_touristy', 
#      'hipster':'avg_hipster', 'divey':'avg_divey', 'classy':'avg_classey', 'trendy':'avg_trendy','upscale': 'avg_upscale',
#      'casual':'avg_casual'}
# yelp_compiled_tract =yelp_compiled_new.groupby(['census_tract','city'], as_index= False).agg({'review_count':'sum','stars':'mean','business_id':'count','price_range': 'mean', 'romantic':'mean', 
#                                                                                  'intimate':'mean', 'touristy':'mean', 'hipster':'mean', 
#                                                                                  'divey':'mean','classy':'mean', 'trendy':'mean','upscale': 'mean', 
#                                                                                  'casual':'mean', 'median_income':'mean', 'pop_2018':'mean',
#                                                                                     'american':'mean', 'asian_fusion':'mean', 'bakeries':'mean', 'barbeque':'mean', 'bars':'mean', 'beer':'mean', 'breakfast_&_brunch':'mean', 'buffets':'mean',
#                                                                                               'burgers':'mean', 'cafes':'mean', 'caterers':'mean', 'chicken_wings':'mean', 'chinese':'mean', 'cocktail_bars':'mean', 'coffee_&_tea':'mean', 'delis':'mean', 
#                                                                                               'desserts':'mean', 'diners':'mean', 'fast_food':'mean', 'italian':'mean', 'japanese':'mean', 'juice_bars_&_smoothies':'mean', 'lounges':'mean', 'mexican':'mean',
#                                                                                               'nightlife':'mean', 'pizza':'mean', 'pubs':'mean', 'salad':'mean', 'sandwiches':'mean', 'seafood':'mean', 'soup':'mean', 'specialty_food':'mean', 'sports_bars':'mean', 
#                                                                                               'steakhouses':'mean', 'sushi_bars':'mean', 'thai':'mean', 'vegan':'mean', 'vegetarian':'mean', 'wine_&_spirits':'mean', 'wine_bars':'mean', 'ethnic_food':'mean', 'weighted_star':'sum'}).rename(columns=d)
# yelp_compiled_tract['resturant_density'] = yelp_compiled_tract['pop_2018']/yelp_compiled_tract['business'] 

In [None]:
# Aggregate the business-level table to one row per (census_tract, city).
d = {'business_id': 'business'}  # the per-tract row count is exposed as `business`

ambience_cols = ['romantic', 'intimate', 'touristy', 'hipster', 'divey',
                 'classy', 'trendy', 'upscale', 'casual']
category_cols = ['american', 'asian_fusion', 'bakeries', 'barbeque', 'bars', 'beer',
                 'breakfast_&_brunch', 'buffets', 'burgers', 'cafes', 'caterers',
                 'chicken_wings', 'chinese', 'cocktail_bars', 'coffee_&_tea', 'delis',
                 'desserts', 'diners', 'fast_food', 'italian', 'japanese',
                 'juice_bars_&_smoothies', 'lounges', 'mexican', 'nightlife', 'pizza',
                 'pubs', 'salad', 'sandwiches', 'seafood', 'soup', 'specialty_food',
                 'sports_bars', 'steakhouses', 'sushi_bars', 'thai', 'vegan',
                 'vegetarian', 'wine_&_spirits', 'wine_bars', 'ethnic_food']

# The aggregation spec is built step by step instead of as one literal. The
# old literal listed 'price_range' twice and 'romantic' as both 'mean' and
# 'sum'; Python silently keeps the last value, so 'romantic' was summed.
# That effective behaviour and the column order are preserved here.
agg_spec = {
    'review_count': 'sum',
    'stars': 'mean',
    'business_id': 'count',
    'price_range': 'mean',
}
agg_spec.update({col: 'sum' for col in ambience_cols})
agg_spec.update({'median_income': 'mean', 'pop_2018': 'mean'})
agg_spec.update({col: 'sum' for col in category_cols})
agg_spec['weighted_star'] = 'sum'

yelp_compiled_tract = (
    yelp_compiled_new
    .groupby(['census_tract', 'city'], as_index=False)
    .agg(agg_spec)
    .rename(columns=d)
)

# Businesses per 100 residents; a pop_2018 of 0 produces inf here.
yelp_compiled_tract['restaurant_density'] = (
    yelp_compiled_tract['business'] / yelp_compiled_tract['pop_2018'] * 100
)
# NOTE(review): income_classifier compares against the q1/q3 quartiles of the
# business-level median_income column, not of this tract-level table; confirm.
yelp_compiled_tract['income_range'] = yelp_compiled_tract['median_income'].apply(income_classifier)
# NOTE(review): the origin of the divisor 617 is not shown in the notebook;
# confirm what it represents before changing it.
RANKING_DIVISOR = 617
yelp_compiled_tract['ranking'] = yelp_compiled_tract['weighted_star'] / RANKING_DIVISOR
# Mean number of reviews per business in the tract.
yelp_compiled_tract['average_review'] = (
    yelp_compiled_tract['review_count'] / yelp_compiled_tract['business']
)

In [None]:
# Turn the per-tract sums of the ambience and category columns into
# per-business shares by dividing each by the tract's business count.
# Re-running this cell divides the already divided values again.
share_columns = [
    'romantic', 'intimate', 'touristy', 'hipster', 'divey', 'classy', 'trendy',
    'upscale', 'casual', 'american', 'asian_fusion', 'bakeries', 'barbeque',
    'bars', 'beer', 'breakfast_&_brunch', 'buffets', 'burgers', 'cafes',
    'caterers', 'chicken_wings', 'chinese', 'cocktail_bars', 'coffee_&_tea',
    'delis', 'desserts', 'diners', 'fast_food', 'italian', 'japanese',
    'juice_bars_&_smoothies', 'lounges', 'mexican', 'nightlife', 'pizza', 'pubs',
    'salad', 'sandwiches', 'seafood', 'soup', 'specialty_food', 'sports_bars',
    'steakhouses', 'sushi_bars', 'thai', 'vegan', 'vegetarian', 'wine_&_spirits',
    'wine_bars', 'ethnic_food',
]
businesses_per_tract = yelp_compiled_tract['business'].values
yelp_compiled_tract[share_columns] = (
    yelp_compiled_tract[share_columns].div(businesses_per_tract, axis=0)
)


In [None]:
yelp_compiled_tract.restaurant_density = yelp_compiled_tract.restaurant_density.replace(np.inf, np.nan)

In [None]:
yelp_compiled_tract=yelp_compiled_tract.dropna(axis =0)

In [None]:
yelp_compiled_tract.to_csv('Yelp_data/yelp_compiled_tract.csv', index = False)

In [None]:
from ipywidgets import interact
@interact(y = ['restaurant_density', 'romantic', 'intimate', 'touristy', 'hipster', 'divey', 'classy', 'trendy', 'upscale', 'casual'])
def make_scatter(y):
    """Scatter the chosen tract-level column against median income.

    Args:
        y: name of the ``yelp_compiled_tract`` column for the y-axis,
            selected through the widget created by ``interact``.
    """
    yelp_compiled_tract.plot.scatter(x='median_income', y=y);

In [None]:
yelp_compiled_tract[yelp_compiled_tract['restaurant_density']> 30]

In [None]:
from ipywidgets import interact
@interact(y = [ 'american',
 'asian_fusion', 'bakeries', 'barbeque', 'bars', 'beer', 'breakfast_&_brunch', 'buffets', 'burgers', 'cafes', 'caterers', 'chicken_wings', 'chinese',
 'cocktail_bars', 'coffee_&_tea', 'delis', 'desserts', 'diners', 'fast_food', 'italian', 'japanese', 'juice_bars_&_smoothies', 'lounges',
 'mexican', 'nightlife', 'pizza', 'pubs', 'salad', 'sandwiches', 'seafood', 'soup', 'specialty_food', 'sports_bars', 'steakhouses', 'sushi_bars',
 'thai', 'vegan', 'vegetarian', 'wine_&_spirits', 'wine_bars', 'ethnic_food'])
def make_scatter(y):
    """Scatter the chosen category column against median income.

    NOTE(review): an earlier cell defines a function with this same name;
    this definition replaces it once the cell runs.

    Args:
        y: name of the ``yelp_compiled_tract`` column for the y-axis,
            selected through the widget created by ``interact``.
    """
    yelp_compiled_tract.plot.scatter(x='median_income', y=y);

In [None]:
import plotly.express as px

In [None]:
fig, ax = plt.subplots(figsize = (9,7))
sns.boxplot(x = 'income_range', y = 'median_income', data = yelp_compiled_tract);

In [None]:
feature_cols = yelp_compiled_tract.columns.drop([ 'city', 'census_tract', 'median_income', 'income_range'])

In [None]:
X= yelp_compiled_tract[feature_cols]
y= yelp_compiled_tract['income_range']

In [None]:
print(X.shape)
print(y.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size =0.3, random_state = 321)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and the same transform is then applied to the test split.
# NOTE(review): `scaler` is not used in the visible notebook; `sc` is the
# instance that gets fitted.
scaler = StandardScaler()
sc = StandardScaler()
# X_train / X_test are overwritten with the arrays returned by the scaler.
X_train = sc.fit_transform(X_train)
X_test = sc.transform (X_test)

In [None]:
%time
clf= LogisticRegression(solver='lbfgs',multi_class='multinomial')
clf.fit(X_train, y_train)
print('Coefficients: ', clf.coef_)
print('Intercept: ', clf.intercept_)

In [None]:
probas = clf.predict_proba(X_train)
preds = clf.predict(X_train)

In [None]:
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
# Print Classification report to look at precision, recall and f1 score
print(classification_report(y_test,y_pred))

In [None]:
from sklearn import metrics
# print accuracy scores
print('Accuracy Score: ', accuracy_score(y_test, y_pred))
print('F1 Score: ', metrics.f1_score(y_test,  y_pred, average='weighted'))

In [None]:
# Confusion matrix for the test-set predictions: printed as an array, then
# drawn as an annotated heatmap.
log_mod_cm = confusion_matrix(y_test, y_pred)
print(log_mod_cm)

sns.set(rc={'figure.figsize': (8, 6)}, font_scale=2)
ax = plt.subplot()
cm = confusion_matrix(y_test, y_pred)
# annot=True writes the count into each cell
sns.heatmap(cm, annot=True, ax=ax, fmt='g', linewidths=.5, cmap='coolwarm');
ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix')
plt.tight_layout()

In [None]:
# Take the first and last 15 rows of `features` and stack them into one frame.
# NOTE(review): `features` is not defined anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel (Restart & Run All). The plot
# below reads `importance` and `feature` columns from it; restore the cell
# that builds it, presumably from the fitted model's coefficients (confirm).
top15 = features.head(15)
bottom15 =features.tail(15)
top_features = pd.concat([top15,bottom15])
top_features.head()

In [None]:
# Horizontal bar chart of the selected features against their importance.
sns.set(rc={'figure.figsize': (15, 8)}, font_scale=1.6)
sns.set_style("ticks")
# x/y are passed by keyword: seaborn 0.12+ rejects positional data arguments,
# and the keyword form is accepted by older releases as well.
sns.barplot(x=top_features.importance, y=top_features.feature, alpha=1)
plt.title('Logmod Feature Importance', fontsize=20)
plt.ylabel('Features', fontsize=20)
plt.xlabel('Feature Importance', fontsize=20)  # label typo ("Iportance") fixed
plt.yticks(fontsize=15)
plt.tight_layout()

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
# # binning median income into Low / Medium / High ranges
# yelp_compiled_tract['income_bins'] = pd.cut(x=yelp_compiled_tract['median_income'], bins=[16206, 53383, 73199, 173056], labels =['Low','Medium','High'])

In [None]:
from imblearn.under_sampling import RandomUnderSampler
#under sampler


In [None]:
undersampler = RandomUnderSampler(random_state = 321)
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)
y_resampled.value_counts()

In [None]:
clf.fit(X_resampled, y_resampled)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

In [None]:
#over sampler
oversampler = RandomOverSampler(random_state = 321)
X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)
y_resampled.value_counts()

In [None]:
clf.fit(X_resampled, y_resampled)

In [None]:
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))


In [None]:
# SMOTE resampling
from imblearn.over_sampling import SMOTE

In [None]:
# Rebalance the training split with SMOTE before refitting the model.
# n_jobs is no longer passed: imbalanced-learn deprecated that SMOTE argument
# in 0.10 and later removed it, so supplying it fails on current releases.
# Leaving it out is accepted by older releases too.
oversampler = SMOTE(k_neighbors=5, random_state=321)
X_smote, y_smote = oversampler.fit_resample(X_train, y_train)
y_smote.value_counts()

In [None]:
clf.fit(X_smote, y_smote)

In [None]:
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

In [None]:
@interact(City=yelp_compiled_tract.city.dropna().unique())
def make_plot(City):
    """Scatter of tract price range against median income for one city.

    Args:
        City: value of the ``city`` column used to filter
            ``yelp_compiled_tract``; chosen through the ``interact`` widget.
    """
    # The tract table has a `price_range` column; no 'avg_pric_range' column
    # is ever created, so the old x='avg_pric_range' raised KeyError.
    city_rows = yelp_compiled_tract[yelp_compiled_tract.city == City]
    city_ax = city_rows.plot(kind='scatter', x='price_range', y='median_income')
    # Plain statements: the former trailing commas wrapped each call in a
    # throw-away tuple.
    city_ax.set_xlabel('Price range')
    city_ax.set_ylabel('Median Income');

In [None]:
@interact(City=yelp_compiled_tract.city.dropna().unique())
def make_plot(City):
    """Scatter of tract price range against median income for one city.

    NOTE(review): this cell repeats the definition in the preceding cell.

    Args:
        City: value of the ``city`` column used to filter
            ``yelp_compiled_tract``; chosen through the ``interact`` widget.
    """
    # The tract table has a `price_range` column; no 'avg_pric_range' column
    # is ever created, so the old x='avg_pric_range' raised KeyError.
    city_rows = yelp_compiled_tract[yelp_compiled_tract.city == City]
    city_ax = city_rows.plot(kind='scatter', x='price_range', y='median_income')
    # Plain statements: the former trailing commas wrapped each call in a
    # throw-away tuple.
    city_ax.set_xlabel('Price range')
    city_ax.set_ylabel('Median Income');

In [None]:
from IPython.display import Image
import plotly.express as px

In [None]:

# Restaurant density against median income per tract, coloured by city, with
# OLS trend lines.
# The title used to mention "Stars" although x is restaurant_density, and the
# labels dict named columns (wins, total_salary, year_bins) that do not exist
# in this frame, so it had no effect on the axis titles.
fig = px.scatter(
    yelp_compiled_tract,
    x="restaurant_density",
    y="median_income",
    color="city",
    trendline="ols",
    labels={
        "restaurant_density": "Restaurant Density (per 100 residents)",
        "median_income": "Median Income",
        "city": "City",
    },
    title="Correlation Between Restaurant Density and Median Income",
)
fig.show()

In [None]:
# fig = px.scatter(yelp_compiled_tract, x="stars", y ='median_income', facet_col = "income_bins",color ="city")
# #                 title ="Total Salary Over The Years : 2000 - 2016")
# fig.show()