В этом задании вам нужно воспользоваться опытом предыдущих недель, чтобы побить бейзлайн в соревновании по сентимент-анализу отзывов на товары на Kaggle Inclass:

https://inclass.kaggle.com/c/product-reviews-sentiment-analysis-light 

В качестве ответа в этом задании вам нужно загрузить ноутбук с решением и скриншот вашего результата на leaderboard.

Убедитесь, что:

1) ход вашего решения задокументирован достаточно подробно для того, чтобы ваши сокурсники поняли, что вы делали и почему,

2) ваша команда в соревновании состоит только из вас и названа вашим логином на Coursera, чтобы ваши сокурсники могли понять, что на скриншоте именно ваш результат.

In [318]:
import nltk
import numpy as np
import xgboost as xg
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import  LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from nltk.stem import WordNetLemmatizer 
from nltk import  pos_tag ,word_tokenize
#from nltk.tag import pos_tag
import re as regex
import pandas as pd
#import emoji

In [319]:
#nltk.download()


In [320]:

# Add the local nltk_data directory to NLTK's data search path.
nltk.data.path.append('/home/doniyor/Final_Project/nltk_data/')

In [321]:
#Function for writing to file

def AnswerWrite(prediction, filename):
    """Write predictions to `filename` as a two-column ``Id,y`` CSV file.

    Parameters
    ----------
    prediction : sequence of numbers
        Predicted label for each test row, in row order.
    filename : str
        Path of the CSV file to create or overwrite.

    Row ``i`` of the file holds the running index ``i`` (starting at 0) and
    ``prediction[i]``, both formatted without decimals.
    """
    ids = np.arange(len(prediction))
    answer = np.vstack([ids, prediction]).T
    # comments='' stops np.savetxt from prefixing the header with '# ',
    # which would otherwise make the first line '# Id,y' instead of 'Id,y'.
    np.savetxt(filename, answer, fmt='%.0f', delimiter=',',
               header="Id,y", comments='')

In [322]:
# Load the tab-separated train and test sets (absolute paths on the author's machine).
data_train = pd.read_csv("/home/doniyor/Final_Project/Part3/products_sentiment_train.tsv", sep='\t')
data_test = pd.read_csv("/home/doniyor/Final_Project/Part3/products_sentiment_test.tsv", sep='\t')

In [323]:
# Preview the first rows of the training frame.
data_train.head()

Unnamed: 0,Text,Category
0,"2 . take around 10,000 640x480 pictures .",1
1,i downloaded a trial version of computer assoc...,1
2,the wrt54g plus the hga7t is a perfect solutio...,1
3,i dont especially like how music files are uns...,0
4,i was using the cheapie pail ... and it worked...,1


In [324]:
# Preview the first rows of the test frame.
data_test.head()

Unnamed: 0,Id,Text
0,0,"'so , why the small digital elph , rather than..."
1,1,3/4 way through the first disk we played on it...
2,2,better for the zen micro is outlook compatibil...
3,3,6 . play gameboy color games on it with goboy .
4,4,"'likewise , i ''ve heard norton 2004 professio..."


### Cleaning
 * #### From special symbols

In [325]:
def clean_symbols(data,column='Text'):
    """Replace punctuation and special symbols in a text column with spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text to clean; the column is overwritten in place.
    column : str, default 'Text'
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same `data` object, with `column` cleaned.
    """
    # Symbols are replaced one after another, in this order, each by a single
    # space. The original list also held a second "%", "--" and "---"; they
    # could never match once "%" and "-" had been replaced, so they are gone.
    symbols = (
        ",", " : ", "\"", "=", "&", ";", "%", "$",
        "@", "^", "*", "{", "}",
        "[", "]", "|", "/", "\\", ">", "<", "-",
        "!", "?", ".", "'",
        "#",
        "(", ")",
    )
    cleaned = data[column]
    for symbol in symbols:
        # Escape the symbol so it is matched literally as a substring.
        cleaned = cleaned.replace(regex.escape(symbol), " ", regex=True)
    # Assign the result back explicitly instead of relying on an in-place
    # replace on the object returned by .loc, which may not write through.
    data[column] = cleaned
    return data
                                                                     

In [326]:
def remove_by_regex(data ,regexp, column="Text"):
    """Replace every match of `regexp` in a text column with a single space.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text; the column is overwritten in place.
    regexp : compiled pattern or str
        Regular expression whose matches are replaced.
    column : str, default "Text"
        Name of the column to process.

    Returns
    -------
    pandas.DataFrame
        The same `data` object, with `column` updated.
    """
    # Assign the result back explicitly instead of relying on an in-place
    # replace on the object returned by .loc, which may not write through.
    data[column] = data[column].replace(regexp, " ", regex=True)
    return data

In [327]:
def remove_urls(data):
    """Blank out URL-like substrings in the text column.

    A match starts with ``http``, one optional arbitrary character and
    ``://``, runs through the following non-whitespace characters and takes
    one optional trailing whitespace character. Matching and replacement are
    delegated to ``remove_by_regex``, whose result is returned.
    """
    url_pattern = regex.compile(r"http.?://[^\s]+[\s]?")
    return remove_by_regex(data, url_pattern)



In [328]:
def remove_usernames(data):
    """Blank out ``@name`` mentions in the text column.

    A match is ``@`` followed by one or more non-whitespace characters and
    one optional trailing whitespace character. Matching and replacement are
    delegated to ``remove_by_regex``, whose result is returned.
    """
    mention_pattern = regex.compile(r"@[^\s]+[\s]?")
    return remove_by_regex(data, mention_pattern)


In [329]:
def remove_numbers(data):
    """Blank out numbers in the text column.

    A match is one optional leading whitespace character, a run of digits,
    an optional dot and any further digits. Matching and replacement are
    delegated to ``remove_by_regex``, whose result is returned.
    """
    number_pattern = regex.compile(r"\s?[0-9]+\.?[0-9]*")
    return remove_by_regex(data, number_pattern)

In [330]:
def clean_full(data):
    """Run the full text-cleaning chain on `data` and return the result.

    Steps, in order: remove URLs, remove @usernames, replace special
    symbols with spaces, remove numbers.
    """
    # URLs and @usernames have to be removed before clean_symbols runs:
    # clean_symbols replaces "/" and "@" with spaces, after which neither
    # the URL pattern ("://") nor the username pattern ("@...") can match.
    data = remove_urls(data)
    data = remove_usernames(data)
    data = clean_symbols(data)
    data = remove_numbers(data)
    return data

In [331]:
data_train = clean_full(data_train) # cleaned train

In [332]:
data_test = clean_full(data_test) # cleaned test, same cleaning chain as the train set

* #### Stemming (Used lemmatization instead)

In [333]:
# Stemming setup: a Porter stemmer plus the analyzer of a default CountVectorizer,
# which stemmed_words uses to split a document into tokens.
stemmer = nltk.PorterStemmer()
analyzerStem = CountVectorizer().build_analyzer()

def stemmed_words(frame):
    """Yield the stem of every token extracted from the document `frame`.

    Tokens come from ``analyzerStem`` and each one is passed through
    ``stemmer.stem``. The result is a lazy generator.
    """
    # The original called an undefined name `analyzer`; the analyzer built
    # for stemming in this cell is `analyzerStem`.
    return (stemmer.stem(w) for w in analyzerStem(frame))


* #### Lemmatization

In [334]:
# Lemmatization setup: a WordNet lemmatizer plus the analyzer of a default
# CountVectorizer, which lemmatized_words uses to split a document into tokens.
lemmatizer = WordNetLemmatizer()
analyzerLem = CountVectorizer().build_analyzer()

# Function for part of speech detect
def pos_prep(word):
    """Pair `word` with a one-letter part-of-speech code.

    The word is tokenized and tagged on its own, and the tag of the first
    token decides the code: tags starting with ``J`` give ``'a'``, ``V``
    gives ``'v'``, ``N`` gives ``'n'``, ``R`` gives ``'r'``; every other tag
    falls back to ``'n'``.

    Returns
    -------
    tuple
        ``(word, code)``, with the word unchanged.
    """
    tag = pos_tag(word_tokenize(word))[0][1]
    prefix_to_code = (('J', 'a'), ('V', 'v'), ('N', 'n'), ('R', 'r'))
    for prefix, code in prefix_to_code:
        if tag.startswith(prefix):
            return (word, code)
    # Unrecognized tags are treated as nouns.
    return (word, 'n')
    
    
    
#Lemmatize words
def lemmatized_words(frame):
    """Return the list of lemmas for the tokens of the document `frame`.

    Tokens come from ``analyzerLem``; each is paired with a part-of-speech
    code by ``pos_prep`` and then passed to ``lemmatizer.lemmatize``.
    """
    tagged_tokens = (pos_prep(token) for token in analyzerLem(frame))
    return [lemmatizer.lemmatize(token, pos=code) for token, code in tagged_tokens]

In [335]:
# Count vectorizer whose tokens are produced by lemmatized_words.
vectorizerCountLem = CountVectorizer(analyzer=lemmatized_words)

In [337]:
# Count and TF-IDF vectorizers, both taking their tokens from stemmed_words.
vectorizerCount = CountVectorizer(analyzer=stemmed_words)
vectorizerTFidf = TfidfVectorizer(analyzer=stemmed_words)

In [338]:
# Pull the model inputs and labels out of the cleaned frames.
train_text, train_label = data_train["Text"], data_train["Category"]
test_text = data_test["Text"]

### Logistic Regression

In [262]:
# Stand-alone logistic-regression classifier with default settings.
clflog = LogisticRegression()

In [264]:
# Pipeline: lemmatizing count vectorizer followed by logistic regression.
pipe = Pipeline([
    ('vectorizer', vectorizerCountLem),
    ('clf', LogisticRegression()),
])

In [265]:
# Hyper-parameter grid for the logistic-regression pipeline.
# NOTE(review): 'vectorizer__analyzer': ['word'] sets the vectorizer's analyzer
# for every candidate, which appears to replace the lemmatized_words analyzer
# configured on vectorizerCountLem - confirm that searching without
# lemmatization is intended.
grid_params = {
    'vectorizer__min_df': [0, 1, 10, 20, 30],
    'vectorizer__max_df': [0.9, 0.80, 0.7, 1.0],
    'vectorizer__analyzer': ['word'],
    'vectorizer__ngram_range': [(1, 2)],
    #'vectorizer__stop_words':[nltk.corpus.stopwords.words('english2')],

    'clf__C': [1.0, 0.5, 0.7, 0.9, 0.25, 0.1],
    'clf__penalty': ['l2', 'l1'],
    'clf__class_weight': ['balanced', None],
}
    

In [235]:
# Grid search over grid_params for the logistic-regression pipeline:
# 3-fold cross-validation, candidates scored by accuracy, n_jobs=-1.
grid = GridSearchCV(
    pipe,
    param_grid=grid_params,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)

In [236]:
# Run the grid search on the training texts and labels.
grid.fit(train_text, train_label)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer=<function lemmatized_words at 0x7f83412b3d08>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), prep...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'vectorizer__min_df': [0, 1, 10, 20, 30], 'vectorizer__max_df': [0.9, 0.8, 0.7, 1.0], 'vectorizer__analyzer': ['word'], 'vectorizer__ngram_range': [(1, 2)], 'clf__C': [1.0, 0.5, 0.7, 0.9, 0.25, 0.1], 'clf__penalty': ['l2', 'l1'], 'clf__class_weight': ['balanced', None]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [237]:
# Best score reached during the search (the search is scored with 'accuracy').
grid.best_score_

0.774

In [238]:
# Predict the test texts with the best pipeline found and write them to test10.csv.
AnswerWrite(grid.best_estimator_.predict(test_text),"test10.csv")

### SGD Classifier

In [57]:
# Pipeline: stemming count vectorizer followed by an SGD-trained linear classifier.
pipeSGD = Pipeline([
    ('vectorizer', vectorizerCount),
    ('clf', SGDClassifier()),
])

In [63]:
# Hyper-parameter grid for the SGD pipeline (hinge loss only).
grid_paramsSGD = {
    #'vectorizer': [CountVectorizer(), TfidfVectorizer()],
    'vectorizer__min_df': [0, 1, 10, 20, 30],
    'vectorizer__max_df': [0.9, 0.80, 0.7, 1.0],
    #'vectorizer__stop_words':['english',stopwords.words('english')],

    #'clf': [LogisticRegression()],
    'clf__loss': ['hinge'],
    'clf__penalty': ['l2', 'l1'],
    'clf__alpha': [0.0001, 0.001, 0.01, 0.1, 1],
    'clf__learning_rate': ['optimal', 'invscaling'],
    'clf__class_weight': ['balanced', None],
    'clf__eta0': [0.01],
}

In [64]:
# Grid search over grid_paramsSGD for the SGD pipeline:
# 3-fold cross-validation, candidates scored by accuracy, n_jobs=-1.
gridSGD = GridSearchCV(
    pipeSGD,
    param_grid=grid_paramsSGD,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)

In [None]:
# Run the SGD grid search on the training texts and labels.
gridSGD.fit(train_text, train_label)

In [None]:
# Best score reached during the SGD search (the search is scored with 'accuracy').
gridSGD.best_score_

### Random Forest

In [267]:
# Pipeline: lemmatizing count vectorizer followed by a random forest.
pipeRF = Pipeline([
    ('vectorizer', vectorizerCountLem),
    ('clf', RandomForestClassifier()),
])


In [272]:
# Hyper-parameter grid for the random-forest pipeline.
# NOTE(review): pipeRF's vectorizer is built with a callable analyzer
# (lemmatized_words); scikit-learn documents ngram_range as applying only when
# the analyzer is not callable, so (1, 2) may have no effect here - confirm.
grid_paramsRF = {
    #'vectorizer': [CountVectorizer(), TfidfVectorizer()],
    'vectorizer__min_df': [0, 1, 10, 20, 30],
    'vectorizer__max_df': [0.9, 0.80, 0.7, 1.0],
    'vectorizer__ngram_range': [(1, 2)],
    #'vectorizer__stop_words':['english',stopwords.words('english')],

    'clf__n_estimators': [1000],
    'clf__max_features': ['sqrt'],
    #'clf__min_samples_split': [2,3],
    #'clf__min_samples_leaf':[2,3],
    'clf__class_weight': ['balanced'],
}
              
             

In [273]:
# Grid search over grid_paramsRF for the random-forest pipeline:
# 3-fold cross-validation, candidates scored by accuracy, n_jobs=-1.
gridRF = GridSearchCV(
    pipeRF,
    param_grid=grid_paramsRF,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)

In [274]:
# Run the random-forest grid search on the training texts and labels.
gridRF.fit(train_text, train_label)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer=<function lemmatized_words at 0x7f8310224bf8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), prep...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'vectorizer__min_df': [0, 1, 10, 20, 30], 'vectorizer__max_df': [0.9, 0.8, 0.7, 1.0], 'vectorizer__ngram_range': [(1, 2)], 'clf__n_estimators': [1000], 'clf__max_features': ['sqrt'], 'clf__class_weight': ['balanced']},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [276]:
# Best score reached during the random-forest search (scored with 'accuracy').
gridRF.best_score_

0.759

In [79]:
# Show the best pipeline found by the random-forest search.
gridRF.best_estimator_

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer=<function stemmed_words at 0x7fc78bc6dd90>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=0.8,
        max_features=None, min_df=1, ngram_range=(1, 2), preproc...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [78]:
# Predict the test texts with the best random-forest pipeline and write them to test8.csv.
AnswerWrite(gridRF.best_estimator_.predict(test_text),"test8.csv")