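## Importing required packages:
#### sklearn: package which provides the machine learning functions used here (train/test split, text vectorization, logistic regression, metrics)
#### nltk: package used to perform natural language processing (stop words, tokenization, stemming)
#### pandas: package used to load and manipulate tabular data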

In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed in 0.20, so it is only kept as a fallback.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

### Loading the English stop words into the `stop_words` list

In [None]:
# De-duplicate the English stop words and sort them. Sorting keeps the list order
# (and therefore the preview below) identical on every kernel restart; the order of
# list(set(...)) depends on string hashing and changes from run to run.
stop_words = sorted(set(stopwords.words('english')))

# preview the first five stop words
stop_words[:5]

### Defining a stemmer object to extract root words

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# Porter stemmer instance; later cells call ps.stem(...) to reduce words to their stems
ps = PorterStemmer()



### Stop words:
Stop words are very common words that carry little meaning on their own. Some examples of stop words are: "a", "and", "but", "how", "or", and "what." Most Internet search engines filter out stop words: users are not prevented from typing them, but they are ignored when the query is processed.

### Stemming:
Stemming reduces inflected or derived words to a common root form. A stemmer for English, for example, should identify the string "cats" (and possibly "catlike", "catty" etc.) as based on the root "cat", and "stems", "stemmer", "stemming", "stemmed" as based on "stem". A stemming algorithm reduces the words "fishing", "fished", and "fisher" to the root word, "fish".

In [None]:
# read yelp.csv into a DataFrame
yelp = pd.read_csv("yelp.csv", encoding="ISO-8859-1")

# create a new DataFrame that only contains the 5-star and 1-star reviews
yelp_best_worst = yelp[(yelp.stars == 5) | (yelp.stars == 1)]

# a set gives constant-time membership tests while filtering tokens
stop_word_set = set(stop_words)


def remove_stop_words(text):
    """Return `text` with English stop words removed.

    The review is split into word tokens with NLTK's `word_tokenize`
    (this needs the NLTK tokenizer data to be downloaded). Tokens whose
    lower-cased form is in `stop_words` are dropped and the remaining
    tokens are joined back together with single spaces.

    The previous code iterated over the *characters* of each review and
    discarded the result, so no stop words were actually removed.
    """
    return " ".join(
        token for token in word_tokenize(text) if token.lower() not in stop_word_set
    )


def stem1(text1):
    """Return `text1` with every word replaced by its Porter stem.

    `PorterStemmer.stem` stems a single word, so the text is tokenized
    first and each token is stemmed separately; passing the whole review
    in one call treats it as one long "word". The stemmed tokens are
    joined back together with single spaces.
    """
    return " ".join(ps.stem(token) for token in word_tokenize(text1))


# define X and y: X is the cleaned review text, y is the star rating (1 or 5)
X = yelp_best_worst.text.apply(remove_stop_words).apply(stem1)
y = yelp_best_worst.stars

# split the new DataFrame into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### Displaying the first few rows of the filtered `yelp_best_worst` dataframe

In [None]:
# preview the first rows of the subset that keeps only 1-star and 5-star reviews
yelp_best_worst.head()


In [None]:
# toy corpus: four short sentences used to illustrate the document-term matrix
example_sentences = [
    "how are you",
    "We are are fine",
    "are we fine",
    "yes we are fine",
]

cveg = CountVectorizer()

# learn the vocabulary from the sentences and count each term per sentence
x_example = cveg.fit_transform(example_sentences)

# show the matrix as a dense array: one row per sentence, one column per term
x_example.toarray()


In [None]:
# List the vocabulary terms in column order of the document-term matrix.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (added in 1.0); fall back to the old name so the
# cell also runs on older releases.
if hasattr(cveg, "get_feature_names_out"):
    feature_names = cveg.get_feature_names_out()
else:
    feature_names = cveg.get_feature_names()

# plain Python strings so the output looks the same on either code path
[str(name) for name in feature_names]

### Vectorizer transformation example: word counts per sentence (bag of words; a count can exceed 1, so this is not strictly one-hot encoding)

| are | fine | how | we | yes | you |
| :- | :- | :- | :- | :- | :- |
| 1 | 0 | 1 | 0 | 0 | 1 |
| 2 | 1 | 0 | 1 | 0 | 0 |
| 1 | 1 | 0 | 1 | 0 | 0 |
| 1 | 1 | 0 | 1 | 1 | 0 |


In [None]:
# use CountVectorizer to create document-term matrices from X_train and X_test
# (the vocabulary is fit on the training text only; the test text is transformed
# with that same vocabulary)
# NOTE(review): X_train/X_test are re-split with test_size=0.3 and vect and both
# matrices are rebuilt in the following cells, so the values computed here are
# overwritten before the model is fit.
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [None]:
# split into training and testing sets: test_size=0.3 holds out 30% of the reviews
# for testing, and random_state=1 fixes the shuffle so the split is repeatable
# NOTE(review): this replaces the earlier split of the same X and y that used the
# default test size.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [None]:
# use CountVectorizer with text column only
# fit the vocabulary on the training reviews, then encode the test reviews with
# that same vocabulary so both matrices share identical columns
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)


### Apply logistic regression for classification

In [None]:
# train a logistic regression classifier on the text-only document-term matrix;
# C is scikit-learn's inverse regularization strength, so C=1e9 leaves the model
# almost unregularized
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train_dtm, y_train)

# predict the star rating of each held-out review and report the accuracy
y_pred_class = logreg.predict(X_test_dtm)
test_accuracy = metrics.accuracy_score(y_test, y_pred_class)
print(test_accuracy)

### Print confusion matrix

In [None]:
# confusion matrix for the test set: rows are the true star ratings and columns
# are the predicted ones (scikit-learn's convention)
confusion_matrix(y_test, y_pred_class)