### Summary of Analysis

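For this analysis I used the pandas, numpy, seaborn, and matplotlib libraries in Python. I used scikit-learn to cluster the recipes with K-means, statsmodels to fit an ordinary least squares regression with 'rating' as the response variable, and Keras to train a small neural network that predicts the binned rating from the nutritional, ingredient, and adjective variables.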

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
%matplotlib inline

In [None]:
# Load the master recipe table from its absolute local path.
recipe_csv_path = '/Users/ranaquadri/Documents/recipe/recipe_master.csv'
df = pd.read_csv(recipe_csv_path)

In [None]:
# Remove the identifier columns; they are not used in the analysis below.
df = df.drop(['title', 'recipe_id'], axis=1)

In [None]:
# Check for missing values: per-column null count and null fraction,
# showing only the columns that actually contain nulls.
null_mask = df.isnull()

total = null_mask.sum().sort_values(ascending=False)
pct = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)

missing_data = pd.concat([total, pct], axis=1, keys=['Total', 'Percent'])
missing_data.loc[missing_data['Total'] > 0]

In [None]:
# Drop observations with missing rating and/or nutritional info
# (a row is removed when either 'rating' or 'nut_cals' is null).
df = df.dropna(subset=['rating', 'nut_cals'])

# Number of rows remaining after the drop.
print(len(df))

In [None]:
# Drop observations where rating < 2 (4 rows according to the original note).
low_rating = df.rating < 2
df = df.drop(df[low_rating].index)

In [None]:
# Drop observations with > 5K reviews
too_many_reviews = df.num_reviews > 5000
df = df.drop(df[too_many_reviews].index)

In [None]:
# Set ingredient ratios above 5 to null instead of dropping the rows.
# The original note counted 56 such rows for ratio_sugar_flour; the same
# cutoff is applied to ratio_fat_flour.
for ratio_col in ['ratio_sugar_flour', 'ratio_fat_flour']:
    df.loc[df[ratio_col] > 5, ratio_col] = np.nan

In [None]:
# Set data types of variables: adjective flags become booleans,
# count-like columns become integers.
column_dtypes = [
    ('pos_adj', 'bool'),
    ('health_adj', 'bool'),
    ('nut_sod', 'int'),
    ('nut_choles', 'int'),
    ('num_reviews', 'int'),
]
for column, dtype in column_dtypes:
    df[column] = df[column].astype(dtype)

In [None]:
# Create bins for rating in order to use neural network:
# four equal-width bins, stored as integer bin codes (labels=False).
rating_bin_codes = pd.cut(df.rating, 4, labels=False)
df['rat_cat'] = rating_bin_codes

In [None]:
df_cluster=df_cluster[df_cluster.apply(lambda x: np.abs(x - x.mean()) / x.std() < 3).all(axis=1)]

In [None]:
df.describe()

In [None]:
df['recipe_type'].value_counts()

In [None]:
### Data Visualization

# Scatterplot of ratio sugar/flour vs rating
df.plot.scatter(x='rating', y='ratio_sugar_flour')

# Scatterplot of ratio sugar/flour vs rating
df.plot.scatter(x='rating', y='ratio_fat_flour')

# Scatterplot of ratio sugar/flour vs rating
df.plot.scatter(x='rating', y='num_reviews')

# Scatterplot of ratio sugar/flour vs rating
df.plot.scatter(x='rating', y='nut_cals')

In [None]:
# plot the distribution of each variable 
df["ratio_sugar_flour"].plot(kind="density")
df["ratio_fat_flour"].plot(kind="density")

In [None]:
df["nut_cals"].plot(kind="density")

In [None]:
df_cluster=df[['num_reviews','nut_cals','nut_carb','nut_choles','nut_fat','nut_prot','nut_sod','social_rank',
             'sugar','fat','dry','recipe_type']]

In [None]:
df_cluster['rec_type'] = df['recipe_type'].map({'other':0, 'cookies': 1, 'cake':2, 'scones':3, 'pie':4, 'brownies':5, 'snickerdoodles':1,'muffins':7})

In [None]:
df_cluster.drop(['recipe_type'], axis=1, inplace=True)

In [None]:
# Within each rating bin, the fraction of recipes with / without a positive
# adjective (each crosstab row is divided by its own total, so values are 0-1).
ct = pd.crosstab(df.rat_cat, df.pos_adj).apply(lambda row: row / row.sum(), axis=1)
stacked = ct.stack().reset_index().rename(columns={0: 'value'})

# plot grouped bar chart (seaborn is imported as `sns` at the top of the notebook)
sns.barplot(x=stacked.rat_cat, y=stacked.value, hue=stacked.pos_adj)
plt.legend(loc='upper center')
plt.xlabel("Rating")
plt.ylabel("Percent")
plt.title("Positive Adjective vs Rating")

In [None]:
# Within each rating bin, the fraction of recipes with / without a health
# adjective (each crosstab row is divided by its own total, so values are 0-1).
ct = pd.crosstab(df.rat_cat, df.health_adj).apply(lambda row: row / row.sum(), axis=1)
stacked = ct.stack().reset_index().rename(columns={0: 'value'})

# plot grouped bar chart (seaborn is imported as `sns` at the top of the notebook)
sns.barplot(x=stacked.rat_cat, y=stacked.value, hue=stacked.health_adj)
plt.legend(loc='upper center')
plt.xlabel("Rating")
plt.ylabel("Percent")
plt.title("Health Adjective vs Rating")

In [None]:
# Summary statistics plus shape measures for the rating distribution.
rating = df['rating']
print(rating.describe())
print("Skewness: {0:0.3f}".format(rating.skew()))
print("Kurtosis: {0:0.3f}".format(rating.kurt()))

In [None]:
# Correlation map: pairwise correlations of df's columns on a fixed
# [-1, +1] colour scale, drawn on a 9x9 inch figure.
f, ax = plt.subplots(figsize=(9, 9))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, vmin=-1, vmax=+1, ax=ax)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every clustering feature; the result is a plain array with the
# same row/column layout as df_cluster.
df_cluster1 = StandardScaler().fit(df_cluster).transform(df_cluster)

In [None]:
#KMeans
sns.set_context('poster')
sns.set_color_codes()
plot_kwds = {'alpha' : 0.5, 's' : 80, 'linewidths':0}

from sklearn.cluster import KMeans
km_cluster_model = KMeans(n_clusters=6, random_state=1)
# Fit and predict on the SAME standardized matrix. Fitting on the raw
# df_cluster and predicting on df_cluster1 would compare scaled points with
# centroids learned in unscaled units.
km_cluster_model.fit(df_cluster1)
labels = km_cluster_model.predict(df_cluster1)

# One colour per cluster id (KMeans labels are 0..n_clusters-1).
palette = sns.color_palette('deep', labels.max() + 1)
colors = [palette[cluster_id] for cluster_id in labels]

# Scatter of the first two standardized feature columns, coloured by cluster.
plt.figure(figsize=(14,8))
plt.scatter(df_cluster1.T[0], df_cluster1.T[1], c=colors, **plot_kwds)
frame = plt.gca()
frame.axes.get_xaxis().set_visible(False)
frame.axes.get_yaxis().set_visible(False)
plt.title('Clusters found by {}'.format(KMeans.__name__), fontsize=24)

In [None]:
# Attach the cluster id to each row, then append one indicator column per
# cluster (named cluster_<id>) and preview the result.
df_cluster["cluster"] = labels
cluster_dummies = pd.get_dummies(df_cluster['cluster'], prefix="cluster")
df_cluster = pd.concat([df_cluster, cluster_dummies], axis=1)
df_cluster.head()

In [None]:
import statsmodels.formula.api as smf

# Ordinary least squares regression of rating on the adjective flags, protein,
# review count, sugar/flour ratio and recipe type.
rating_formula = 'rating ~ pos_adj+health_adj+nut_prot+num_reviews+ratio_sugar_flour+recipe_type'
ols_spec = smf.ols(formula=rating_formula, data=df)
lm = ols_spec.fit()
lm.summary()

In [None]:
# Predictor columns fed to the neural network: numeric recipe features
# followed by the two adjective flags.
numeric_feature_cols = [
    'num_reviews', 'nut_cals', 'nut_carb', 'nut_choles', 'nut_fat',
    'nut_prot', 'nut_sod', 'social_rank', 'sugar', 'fat', 'dry',
]
adjective_flag_cols = ['pos_adj', 'health_adj']
cols_to_keep = numeric_feature_cols + adjective_flag_cols

In [None]:
# Histogram of the binned rating codes (the neural network's target).
plt.hist(df.rat_cat)


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(df[cols_to_keep], df['rat_cat'])

ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

print(X_train.shape)
print(y_train.shape)

In [None]:
import numpy as np

# Distinct values present in the one-hot encoded training labels.
np.unique(y_train.ravel())

In [None]:
# Layer sizes: one hidden layer as wide as the input, four output classes.
n_input = X_train.shape[1]
n_hidden = n_input
n_output = 4

# Single-hidden-layer classifier: ReLU hidden layer, softmax output.
model = Sequential([
    Dense(n_hidden, input_dim=n_input, activation='relu'),
    Dense(n_output, activation='softmax'),
])

In [None]:
# Multi-class setup: categorical cross-entropy loss, Adam optimizer,
# accuracy tracked as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'])

In [None]:
# Train silently for 400 epochs, scoring the held-out split after each epoch.
# batch_size=None leaves the batch size at the library default.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=400,
    batch_size=None,
    verbose=0)

In [None]:
# Training accuracy recorded for the last epoch.
final_train_acc = history.history['acc'][-1]
final_train_acc

In [None]:
# Training accuracy per epoch.
train_acc_by_epoch = history.history['acc']
plt.plot(train_acc_by_epoch)

In [None]:
# Training vs. validation loss per epoch on one set of axes.
train_loss, test_loss = (history.history[key] for key in ('loss', 'val_loss'))
for loss_curve, curve_label in [(train_loss, 'Training loss'), (test_loss, 'Testing loss')]:
    plt.plot(loss_curve, label=curve_label)
plt.legend()

In [None]:
# summarize history for accuracy: training curve first, validation curve second,
# matching the order of the legend entries.
for metric_key in ('acc', 'val_acc'):
    plt.plot(history.history[metric_key])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'test'], loc='lower right')
plt.show()

In [None]:
# Loss and metric values on the held-out split, in the order given to compile().
score = model.evaluate(X_test, y_test, verbose=0)
print(score)

In [None]:
# Per-class softmax outputs for every held-out row.
y_pred = model.predict(x=X_test)

In [None]:
from sklearn.metrics import confusion_matrix, precision_score,recall_score

# Collapse one-hot rows / softmax rows back to a single class index per sample.
y_test_non_category = np.argmax(y_test, axis=1)
y_predict_non_category = np.argmax(y_pred, axis=1)

# Pin the label set so the matrix is always 4x4 (rows = actual, columns =
# predicted), even when a class is absent from both the truth and predictions.
conf_mat = confusion_matrix(y_test_non_category, y_predict_non_category,
                            labels=[0, 1, 2, 3])

print(conf_mat)

In [None]:
# Precision, micro-averaged over all classes.
precision_score(y_true=y_test_non_category,
                y_pred=y_predict_non_category,
                average="micro")

In [None]:
# Recall, micro-averaged over all classes.
recall_score(y_true=y_test_non_category,
             y_pred=y_predict_non_category,
             average="micro")

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

# Wrap the confusion matrix in a frame indexed by class id (0-3) on both axes,
# then draw it as an annotated heatmap.
class_ids = range(4)
df_cm = pd.DataFrame(conf_mat, index=class_ids, columns=class_ids)

sn.set(font_scale=1.1)
sn.heatmap(df_cm, annot=True, annot_kws={"size": 13}, cmap="YlGnBu")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix for Neural Network")