# Machine Learning Project

In [15]:
import pandas as pd
import nltk
from nltk.util import ngrams
from collections import Counter
from configparser import ConfigParser
from sklearn.feature_extraction.text import CountVectorizer
#########################################################
##                                                     ##
##              IMPORTANT                              ##
## If it's the first time you run the program, please  ##
## uncomment the following lines :                     ##
##                                                     ##
#########################################################

#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')



In [2]:
def retrieveData(path):
    """Load the review data file and pull out the review texts.

    Arguments:
        path: the path of the file containing our data. It is read as
            JSON Lines (one JSON record per line) and must provide a
            'reviewText' field.

    Output:
        A tuple (df, reviews) where
        - df is the dataframe corresponding to our data file,
        - reviews is a list containing every value of the 'reviewText'
          column, in row order.
    """
    df = pd.read_json(path, lines=True)

    # Iterating over a Series yields its values in index order.
    reviews = list(df['reviewText'])

    return (df, reviews)

In [3]:
def tokenize(reviews, nb_tokens=100):
    """Return the most common words of a text as ('word', #occurrences) pairs.

    Arguments:
        reviews: a single string (one review, or several reviews glued
            together).
        nb_tokens: how many of the most frequent words to return; it is
            handed unchanged to Counter.most_common.

    Output:
        A list of (word, count) tuples, most frequent first.

    Processing steps, in order: word tokenization, removal of tokens that
    are not purely alphabetic, lower-casing, removal of English stop words,
    WordNet lemmatization, counting.
    """
    raw_tokens = nltk.tokenize.word_tokenize(reviews)
    english_stops = set(nltk.corpus.stopwords.words('english'))
    lemmatizer = nltk.stem.WordNetLemmatizer()

    frequencies = Counter()
    for raw in raw_tokens:
        # Drop numbers, punctuation and anything else that is not a plain word.
        if not raw.isalpha():
            continue
        # Lower-case so that "Case" and "case" are counted together.
        word = raw.lower()
        if word in english_stops:
            continue
        frequencies[lemmatizer.lemmatize(word)] += 1

    return frequencies.most_common(nb_tokens)

In [13]:
def tag(tokens, tag):
    """Keep only the tokens whose part-of-speech tag equals `tag`.

    Arguments:
        tokens: a list of words, passed as-is to nltk.pos_tag.
        tag: the part-of-speech tag to keep (for example 'NN').

    Output:
        The list of words whose assigned tag is exactly `tag`, in their
        original order. The comparison is an equality test, so 'NN' does
        not match 'NNS'.
    """
    matching = []
    for word, pos in nltk.pos_tag(tokens):
        if pos == tag:
            matching.append(word)
    return matching

In [4]:
def extract(reviews, nb_candidates=300, nb_features=30):
    """Extract the most relevant features (frequent nouns) from all the reviews.

    Arguments:
        reviews: an iterable of review strings.
        nb_candidates: how many of the most common words of the whole
            corpus are considered as candidates (default 300).
        nb_features: maximum number of nouns returned (default 30).

    Output:
        A list of at most nb_features words: the candidates tagged 'NN',
        kept in decreasing order of frequency. The list is also printed.
    """
    # Glue every review into one text. This produces exactly the string the
    # incremental "+=" version built (a leading space, then each review
    # followed by a newline) without re-copying the accumulated text at
    # every iteration.
    totalReviews = " " + "".join(review + "\n" for review in reviews)

    # Retrieve the most common relevant words of the whole corpus
    token_count = tokenize(totalReviews, nb_candidates)
    tokens = [word for word, _ in token_count]

    # Retrieve the nouns out of those words. The tagger only sees the list
    # of frequent words here, not the sentences they came from.
    nouns = tag(tokens, 'NN')[:nb_features]
    print('nouns : ', nouns)
    return nouns

In [5]:
def filter(review, features):
    """Keep only the words of a review that are features.

    Note: this function shadows the built-in `filter` for the rest of the
    notebook; the name is kept because other cells call it.

    Arguments:
        review: the text of one review.
        features: the collection of feature words to keep.

    Output:
        The distinct (lemmatized, lower-cased) words of the review that
        belong to `features`, most frequent in the review first.
    """
    # nb_tokens=None makes Counter.most_common return every word of the
    # review. With tokenize's default of 100, a feature that was not among
    # the 100 most frequent words of a long review was silently dropped.
    tokens = [word for word, _ in tokenize(review, nb_tokens=None)]
    filtered_tokens = [t for t in tokens if t in features]
    return filtered_tokens

In [56]:
def prepareData(df, reviews):
    """Build a binary review/feature matrix.

    Arguments:
        df: the dataframe of the data file. It is not used by this
            function; the parameter is kept so existing calls keep working.
        reviews: the list of review texts.

    Output:
        A new DataFrame with one row per review and one column per word of
        the vocabulary learned by CountVectorizer from the filtered
        reviews. A cell is 1 if the word is mentioned in the review, else 0.
    """
    # We call the extract method to retrieve the most relevant words (features)
    features = extract(reviews)

    # For each review, we only keep the words that are features and rebuild
    # a text out of them (each kept word followed by a space).
    # `filter` is the notebook's own function, not the built-in.
    filtered_reviews = [
        "".join(token + " " for token in filter(review, features))
        for review in reviews
    ]

    # We create a column for each feature
    # If that feature is mentioned in the review : the value is 1, else 0
    cv = CountVectorizer(binary=True)
    x = cv.fit_transform(filtered_reviews)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on versions that do not have it yet.
    if hasattr(cv, 'get_feature_names_out'):
        columns = cv.get_feature_names_out()
    else:
        columns = cv.get_feature_names()
    return pd.DataFrame(x.toarray(), columns=columns)

In [57]:
config = ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of
# files it actually parsed. Fail here with a clear message instead of a
# confusing KeyError on 'RESOURCES' just below.
if not config.read('init.cfg'):
    raise FileNotFoundError("Configuration file 'init.cfg' was not found or could not be read")
path = config['RESOURCES']['path']
dataFile = config['RESOURCES']['dataFile']

# We retrieve a dataframe corresponding to our data file
# and an array containing only the reviews.
# path and dataFile are concatenated as-is, so path must end with a separator.
df, reviews = retrieveData(path + dataFile)

# We build a separate dataframe with one column per feature (relevant word).
# If a review contains a feature, the corresponding column has value 1, else 0.
# df itself is left unchanged.
df_features = prepareData(df, reviews)

nouns :  ['case', 'phone', 'otterbox', 'iphone', 'screen', 'protection', 'time', 'product', 'get', 'defender', 'rubber', 'color', 'bulky', 'protector', 'use', 'drop', 'look', 'price', 'work', 'box', 'feel', 'thing', 'month', 'year', 'silicone', 'clip', 'pocket', 'belt', 'part', 'quality']


In [58]:
# (number of rows, number of feature columns) of the feature matrix
df_features.shape

(837, 30)

In [59]:
# Feature indicators (1 = mentioned, 0 = not mentioned) for the first review
df_features.iloc[0]

belt          0
box           0
bulky         0
case          0
clip          0
color         0
defender      0
drop          0
feel          0
get           0
iphone        1
look          0
month         0
otterbox      0
part          0
phone         0
pocket        0
price         1
product       1
protection    0
protector     0
quality       0
rubber        0
screen        0
silicone      0
thing         0
time          0
use           0
work          0
year          0
Name: 0, dtype: int64