# Natural Language Processing

In this NLP project, we will classify Yelp reviews into 1-star or 5-star categories based on the text content of the reviews.

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the Yelp reviews CSV into a DataFrame.
yelp = pd.read_csv(filepath_or_buffer='../input/yelp-data/yelp.csv')

In [None]:
# Preview the first few rows to see which columns the dataset provides.
yelp.head()

In [None]:
# Summarize column names, dtypes and non-null counts.
yelp.info()

In [None]:
# Descriptive statistics for the DataFrame's columns.
yelp.describe()

In [None]:
# Character count of each review, stored as a new 'length' column.
# .str.len() is the vectorized equivalent of .apply(len) and yields NaN for a
# missing review instead of raising a TypeError.
yelp['length'] = yelp['text'].str.len()
yelp.head()

## EDA

In [None]:
# One histogram of review length per star rating, laid out as columns.
g = sns.FacetGrid(yelp, col='stars')
g.map(plt.hist, 'length')

In [None]:
# Distribution of review length for each star rating.
sns.boxplot(data=yelp, x='stars', y='length')

In [None]:
# Number of reviews for each star rating.
sns.countplot(data=yelp, x='stars')

In [None]:
# Mean of every numeric column per star rating.
# numeric_only=True is needed on pandas >= 2.0, where averaging non-numeric
# columns such as 'text' raises a TypeError instead of silently dropping them.
stars = yelp.groupby('stars').mean(numeric_only=True)

In [None]:
# Pairwise correlation between the per-rating column means computed above.
stars.corr()

In [None]:
# Annotated heatmap of the correlations between the per-rating means.
sns.heatmap(data=stars.corr(), annot=True, cmap='coolwarm')

## NLP Classification

To make the task easier, we keep only the 1-star and 5-star reviews.

In [None]:
# Keep only the reviews rated exactly 1 star or exactly 5 stars.
extreme_rating_mask = (yelp['stars'] == 1) | (yelp['stars'] == 5)
yelp_class = yelp[extreme_rating_mask]

In [None]:
# Features are the raw review text and the target is the star rating.
# Both come from yelp_class (the 1-star/5-star subset), not the full yelp
# frame, so the classifier is trained on the binary problem this notebook
# sets out to solve.
X = yelp_class['text']
y = yelp_class['stars']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn a bag-of-words vocabulary from the review text and replace X with the
# resulting document-term count matrix.
# NOTE(review): X is overwritten, so this cell cannot be re-run on its own;
# re-run the cell that defines X first.
# NOTE(review): the vocabulary is fitted before the train/test split, so it
# also sees the test reviews.
cv = CountVectorizer()
X = cv.fit_transform(raw_documents=X)

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=101,
    test_size=0.3,
)

## Training a Model

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training split.
nb = MultinomialNB()
nb.fit(X=X_train, y=y_train)

## Predictions and Evaluations

In [None]:
# Predict star ratings for the held-out test split.
predictions = nb.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

# Compare the predictions with the true ratings: confusion matrix first, then
# the per-class precision/recall/F1 report.
nb_confusion = confusion_matrix(y_test, predictions)
print(nb_confusion)
print('\n')
nb_report = classification_report(y_test, predictions)
print(nb_report)

Let's see what happens if we add TF-IDF weighting to this process using a pipeline.

## Using Text Processing - TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

In [None]:
# The three stages run in order: raw text -> token counts -> TF-IDF weights
# -> Naive Bayes classifier.
pipeline_steps = [
    ('bow', CountVectorizer()),          # strings to token integer counts
    ('tfidf', TfidfTransformer()),       # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),     # train on TF-IDF vectors w/ Naive Bayes classifier
]
pipeline = Pipeline(steps=pipeline_steps)

## Using the Pipeline

The pipeline already contains all the pre-processing steps, so we need to re-split the original data (we overwrote X with the CountVectorized version; what we need here is just the raw text).

In [None]:
# Go back to the raw review text, because the pipeline does its own
# vectorization, then redo the same 70/30 split with the same seed.
X, y = yelp_class['text'], yelp_class['stars']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=101,
    test_size=0.3,
)

In [None]:
# Fit every pipeline stage (counts, TF-IDF, classifier) on the training text.
pipeline.fit(X_train,y_train)

## Predictions and Evaluation

In [None]:
# Score the held-out reviews with the fitted pipeline and print the confusion
# matrix followed by the per-class report.
predictions = pipeline.predict(X_test)
pipeline_confusion = confusion_matrix(y_test, predictions)
print(pipeline_confusion)
pipeline_report = classification_report(y_test, predictions)
print(pipeline_report)

TF-IDF actually made things worse.
