#### Importing Libraries
* Forked originally from : https://www.kaggle.com/chitralc1/random-forest-for-predicting-ratings

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#### Reading Data
* Note: Parsing is problematic for some tools because the column headers are quoted inconsistently with the rows and contain embedded newlines (a bad practice!).
    * **We'll parse it with pandas first and then re-export it for this reason.**

In [None]:
# Load the raw chocolate-review CSV into a DataFrame.
csv_path = '../input/flavors_of_cacao.csv'
df = pd.read_csv(csv_path)

#### Data Exploration

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Show the column labels exactly as they were read from the CSV.
df.columns

#### Clean column names
* Here issues are due to newlines.
* Could also deal with whitespace: http://jonathansoma.com/lede/foundations/classes/pandas%20columns%20and%20functions/fixing-column-names-in-pandas/

In [None]:
# Normalize the header labels:
#   1. trim surrounding spaces first (stripping after the space replacement
#      would find nothing left to strip),
#   2. turn embedded newlines into "-",
#   3. turn the remaining spaces into "-".
# regex=False matches the literal characters on every pandas version instead
# of depending on whether the pattern is interpreted as a regular expression.
# Non-breaking spaces (\xa0) are intentionally left untouched.
df.columns = (
    df.columns
    .str.strip(" ")
    .str.replace("\n", "-", regex=False)
    .str.replace(" ", "-", regex=False)
)
df.columns

In [None]:
# Display the column labels again to confirm the renaming.
df.columns

In [None]:
# Parse the review year (format "%Y") into a proper datetime column.
review_years = df['Review-Date']
df['Review-Date'] = pd.to_datetime(review_years, format="%Y")

In [None]:
# Re-export the frame with cleaned headers as a gzip-compressed CSV,
# leaving out the row index.
export_path = "ChocolateReviews.csv.gz"
df.to_csv(export_path, compression="gzip", index=False)

#### Data Metrics

In [None]:
# Print a concise summary of the frame via DataFrame.info().
df.info()

In [None]:
# Show descriptive statistics via DataFrame.describe().
df.describe()

#### Checking for NaN Attributes

In [None]:
# Count the missing entries in every column.
missing_mask = df.isnull()
missing_mask.sum()


#### Heat Map for better Visualization

In [None]:
# Draw the missing-value mask as a heat map (no colour bar).
null_mask = df.isnull()
sns.heatmap(null_mask, cmap='coolwarm', cbar=False)

#### Different Bean type and their counts

In [None]:
# The headers were cleaned earlier ("Bean\nType" -> "Bean-Type"); the raw
# label no longer exists and would raise KeyError.
df['Bean-Type'].value_counts()

#### Number of distinct bean types

In [None]:
# Number of distinct bean types; uses the cleaned header "Bean-Type"
# because the raw "Bean\nType" label was renamed earlier.
df['Bean-Type'].nunique()

#### Checking for correlation

In [None]:
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this working on pandas versions where DataFrame.corr() no longer
# silently drops text columns.
numeric_columns = df.select_dtypes(include='number')
sns.heatmap(numeric_columns.corr())

In [None]:
# List the current column labels.
df.columns

#### Getting Unique Values for every text related column 

In [None]:
# Report the number of distinct values per column. The keys are the cleaned
# headers produced by the renaming step (newlines and spaces became "-");
# the raw labels would raise KeyError. The company header still contains
# the non-breaking space (\xa0) because only plain spaces were replaced.
print('Unique Values:')
print('Company (Maker-if known): ', df['Company\xa0-(Maker-if-known)'].nunique())
print('Specific Bean Origin or Bar Name: ', df['Specific-Bean-Origin-or-Bar-Name'].nunique())
print('Company Location: ', df['Company-Location'].nunique())
print('Bean Type: ', df['Bean-Type'].nunique())
print('Broad Bean Origin', df['Broad-Bean-Origin'].nunique())
print('Review Date: ', df['Review-Date'].nunique())
print('Cocoa Percent: ', df['Cocoa-Percent'].nunique())

#### Data Visualization


#### Rating Distribution

In [None]:
# Bar chart of how often each rating value occurs.
sns.countplot(x='Rating', data=df)

About 370 ratings belong to 3.5, followed by 3.0.

#### Year-wise distribution

In [None]:
# "Review\nDate" was renamed to "Review-Date" and parsed to datetime, so
# count reviews per calendar year rather than per full timestamp.
sns.countplot(x=df['Review-Date'].dt.year)

#### Rating and Review Date Concentrations

In [None]:
# Uses the cleaned header "Review-Date". The column holds datetimes, so a
# temporary copy with the numeric year is plotted to give the KDE a plain
# numeric axis; df itself is not modified.
rating_by_year = df.assign(**{'Review-Date': df['Review-Date'].dt.year})
sns.jointplot(x='Rating', y='Review-Date', data=rating_by_year, kind='kde', color='brown')

#### Converting String into Integers for better classification

In [None]:
# Turn the percentage strings into integers, using the cleaned header
# "Cocoa-Percent". regex=False makes both replacements literal on every
# pandas version; as a regex, '.' would match any character.
# NOTE: dropping the '.' turns e.g. "75.5" into 755. The notebook
# deliberately shows and corrects this in the following cells.
cocoa_text = df['Cocoa-Percent'].str.replace('%', '', regex=False)
cocoa_text = cocoa_text.str.replace('.', '', regex=False)
df['Cocoa-Percent'] = cocoa_text.astype(int)

#### Corrections: Cocoa Percent cannot be above 100 %
* Stripping `%` turns `75.5%` into `75.5`
* Removing `.` then turns `75.5` into `755`

In [None]:
# Distribution of cocoa percentages; the column is addressed by its cleaned
# header "Cocoa-Percent" (the raw "Cocoa\nPercent" label was renamed).
plt.figure(figsize=(15, 7))
sns.countplot(x='Cocoa-Percent', data=df, color='brown')

#### To fix the above error

In [None]:
def normalizeIt(percent):
    """Bring a cocoa percentage that lost its decimal point back to 0-100.

    Values such as 755 (originally "75.5") are reduced by dropping trailing
    digits until the result no longer exceeds 100. Values that are already
    at most 100 are returned unchanged.

    Dropping digits from the right, instead of keeping the first two
    characters, also handles values that started as "100.x": 1000 becomes
    100 rather than 10.
    """
    while percent > 100:
        # int() keeps the result an integer even for float input.
        percent = int(percent) // 10
    return percent

In [None]:
# Apply the correction element-wise, using the cleaned header
# "Cocoa-Percent" (the raw "Cocoa\nPercent" label no longer exists).
df['Cocoa-Percent'] = df['Cocoa-Percent'].apply(normalizeIt)

#### Let's Plot it again

In [None]:
# Re-draw the cocoa percentage distribution after the correction, again
# addressing the column by its cleaned header "Cocoa-Percent".
plt.figure(figsize=(15, 7))
sns.countplot(x='Cocoa-Percent', data=df, color='brown')

It worked!!

#### Converting Rating

In [None]:
# Scale ratings by 100 and store them as integers so they can serve as
# class labels. Rounding before the cast guards against floating-point
# error: astype(int) truncates, so a product like 229.99999999999997
# would otherwise become 229 instead of 230.
scaled_rating = (df['Rating'] * 100).round()
df['Rating'] = scaled_rating.astype(int)
df['Rating'].head(5)

In [None]:
# List the current column labels.
df.columns

#### Featurizing Text

In [None]:
# One-hot encode each text column (the first level is dropped as baseline).
# Columns are addressed by their cleaned headers; the raw labels with
# embedded newlines were renamed earlier and would raise KeyError.
# A distinct prefix per source column keeps indicator names unique: without
# it, a label that occurs in several columns (for example a country that is
# both a company location and a bean origin) yields identically named
# indicators, and all but the first are discarded by the later
# duplicate-column removal.
company = pd.get_dummies(df['Company\xa0-(Maker-if-known)'], prefix='company', drop_first=True)
sbOrigin = pd.get_dummies(df['Specific-Bean-Origin-or-Bar-Name'], prefix='specific_origin', drop_first=True)
companyLocation = pd.get_dummies(df['Company-Location'], prefix='company_location', drop_first=True)
bType = pd.get_dummies(df['Bean-Type'], prefix='bean_type', drop_first=True)
bbOrigin = pd.get_dummies(df['Broad-Bean-Origin'], prefix='broad_origin', drop_first=True)

In [None]:
# Append every indicator frame to the right of the original columns.
frames_to_join = [df, company, sbOrigin, companyLocation, bType, bbOrigin]
df = pd.concat(frames_to_join, axis=1)

#### Dropping Columns which have been Featurized

In [None]:
# Remove the original text columns now that they are one-hot encoded.
# The labels are the cleaned headers; the raw labels with embedded newlines
# no longer exist and would make drop() raise KeyError.
featurized_columns = [
    'Company\xa0-(Maker-if-known)',
    'Specific-Bean-Origin-or-Bar-Name',
    'Company-Location',
    'Bean-Type',
    'Broad-Bean-Origin',
]
df.drop(featurized_columns, axis=1, inplace=True)

#### Removing Duplicate Column 
Added due to featurization.

[StackOverFlow link](https://stackoverflow.com/questions/14984119/python-pandas-remove-duplicate-columns)

In [None]:
# Keep only the first occurrence of every column label.
repeated_label = df.columns.duplicated()
df = df.loc[:, ~repeated_label]


#### Splitting Into Training and Testing data sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features: everything except the target column.
X = df.drop('Rating', axis=1)
# scikit-learn estimators need purely numeric input, but 'Review-Date' was
# parsed into a datetime column earlier. Replace any datetime column by its
# calendar year so the forest can be fitted.
datetime_columns = X.select_dtypes(include='datetime').columns
X = X.assign(**{col: X[col].dt.year for col in datetime_columns})
# Target variable.
y = df['Rating']
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=7)

#### Importing Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit a 20-tree random forest on the training split. The seed matches the
# one used for the train/test split so repeated runs of the notebook give
# the same model and the same reported scores.
rfc = RandomForestClassifier(n_estimators=20, random_state=7)
rfc.fit(X_train, y_train)

In [None]:
# Predict the rating class for every row of the held-out test features.
rfc_pred = rfc.predict(X_test)

#### Let's Compare how the model performed

In [None]:
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Build the classification report for the test predictions and print it.
report_text = classification_report(y_test, rfc_pred)
print(report_text)

In [None]:
# Overall accuracy on the test split, expressed as a percentage.
accuracy_fraction = accuracy_score(y_test, rfc_pred)
print(accuracy_fraction * 100)