In [27]:
import requests
import bs4
import pandas as pd
import re
import numpy as np
import pandas as pd
import re as regex
from pymystem3 import Mystem
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import  LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn import manifold

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
#Function for writing to file

def AnswerWrite(prediction, filename):
  #np.savetxt('np.csv', a, fmt='%.2f', delimiter=',', header=" #1,  #2,  #3,  #4")
  answer = np.vstack([[i for i in np.arange(0, len(prediction),1)], prediction])
  np.savetxt(filename, answer.T, fmt="%s", delimiter=',', header="Id,y")

In [20]:
with open('/home/doniyor/Final_Project/Part6/test.csv') as f:
    s = f.read()
    test_final = pd.DataFrame({'Review':[item.text for item in bs4.BeautifulSoup(s,'lxml').findAll('review')]})

### Load data

In [4]:
df = pd.read_csv('/home/doniyor/Final_Project/Part6/Ispytano_dataset.csv')

In [5]:
df.head()

Unnamed: 0,Review,class
0,"1.Не плохо выглядит внешне, пока не работает ...",positive
1,"+ Безусловно, цена.+ Наличие HDMI, подключал ...",positive
2,Картинка хорошая. На этом все достоинства зак...,positive
3,понравилось соответствие размеров экрана и це...,positive
4,"высокое разрешение экрана, можно подключить к...",positive


### Cleaning
 * #### From special symbols

In [6]:
def clean_symbols(data,column='Review'):
    for remove in map(lambda r: regex.compile(regex.escape(r)), [",", " : ", "\"", "=", "&", ";", "%", "$",
                                                                     "@", "%", "^", "*", "{", "}",
                                                                     "[", "]", "|", "/", "\\", ">", "<", "-",
                                                                     #"!",
                                                                     "?", ".", "'",
                                                                     "--", "---", "#",
                                                                 "(", ")",
                                                                 ]):
            data.loc[:, "Review"].replace(remove, " ", inplace=True)    
    return data
def remove_by_regex(data ,regexp):
        data.loc[:, "Review"].replace(regexp, " ", inplace=True)
        return data
def remove_urls(data):
        return remove_by_regex(data, regex.compile(r"http.?://[^\s]+[\s]?"))
    
def remove_usernames(data):
        return remove_by_regex(data, regex.compile(r"@[^\s]+[\s]?"))
def remove_numbers(data):
        return remove_by_regex(data, regex.compile(r"\s?[0-9]+\.?[0-9]*"))
                                  

In [7]:
def clean_full(data):
    data = clean_symbols(data)
    data = remove_urls(data)
    data = remove_usernames(data)
    data = remove_numbers(data)
    return data

In [8]:
data = clean_full(df)

In [9]:
data.head()

Unnamed: 0,Review,class
0,Не плохо выглядит внешне пока не работает ...,positive
1,+ Безусловно цена + Наличие HDMI подключал ...,positive
2,Картинка хорошая На этом все достоинства зак...,positive
3,понравилось соответствие размеров экрана и це...,positive
4,высокое разрешение экрана можно подключить к...,positive


* #### Lemmatization

In [10]:
lemmatizer = Mystem()
analyzerLem = CountVectorizer().build_analyzer()

In [11]:
def lemmatized_words(frame):
        #arr = []
        #for w in analyzerLem(frame):
            #prepare = pos_prep(w)
           # arr.append(lemmatizer.lemmatize(w))
            
        return lemmatizer.lemmatize(frame)

In [12]:
# CountVectorizer
vectorizerCountLem = CountVectorizer(analyzer=lemmatized_words);

In [13]:
#Prepare train and test datatest
X_train, X_test, y_train, y_test = train_test_split(data['Review'], data['class'], test_size = 0.20, random_state=42)


### Logistic Regression

In [14]:
clflog = LogisticRegression();

In [16]:
pipe = Pipeline([('vectorizer',vectorizerCountLem),('clf', clflog)])

In [17]:
#GridSearch parameters
grid_params =     {
     'vectorizer__min_df':[10],
     'vectorizer__max_df':[500],
     'vectorizer__analyzer':['word'],
     'vectorizer__ngram_range':[(1,2)],
     #'vectorizer__stop_words':[nltk.corpus.stopwords.words('english2')],
    
      'clf__C': [1.0,0.5,0.7,0.9,0.25,0.1],
      'clf__penalty': ['l2','l1'],
      'clf__class_weight': ['balanced']       
    }
    

In [17]:
#GridSearchCV
grid = GridSearchCV(pipe, cv=3,scoring='accuracy', param_grid=grid_params, n_jobs=-1)

In [19]:
model = pipe.fit(X_train,y_train)

In [22]:
accuracy_score(model.predict(X_test), y_test)

0.9251930501930502

In [23]:
res = model.predict(test_final['Review'])
res = ['neg' if (i == 'negative') else 'pos' for i in res]

In [24]:
AnswerWrite(prediction=res, filename='resSentiment2.csv')

### Try dimention reduction

In [18]:
data_count = vectorizerCountLem.fit_transform(X_train)

In [19]:
# 21638 features
data_count.shape

(16573, 21638)

In [24]:
#Projecting dataset to lower dimensions
svd2000 = TruncatedSVD(n_components=2000,  random_state=42)
data_2000 = svd2000.fit_transform(data_count)

In [23]:
svd1000 = TruncatedSVD(n_components=1000,  random_state=42)
data_1000 = svd1000.fit_transform(data_count)

In [22]:
svd500 = TruncatedSVD(n_components=500, random_state=42)
data_500 = svd500.fit_transform(data_count)

In [20]:
svd100 = TruncatedSVD(n_components=100,  random_state=42)
data_100 = svd100.fit_transform(data_count)

In [25]:
print("Explained 100: ", svd100.explained_variance_ratio_.sum(),
      "| Explained 500: ", svd500.explained_variance_ratio_.sum(),
      "| Explained 1000: ", svd1000.explained_variance_ratio_.sum(),
      "| Explained 2000: ", svd2000.explained_variance_ratio_.sum()) 

Explained 100:  0.980707425451 | Explained 500:  0.99105928908 | Explained 1000:  0.994851446009 | Explained 2000:  0.997618659644


* Logistik regression

In [33]:
#Pipeline for 100 features
pipelog_svd = Pipeline([('vectorizer',vectorizerCountLem),('svd', svd100),('clf',clflog)])

In [35]:
log_cr = cross_val_score(estimator=pipelog_svd,X=X_train,y=y_train, cv=3)

In [38]:
pipelog_svd.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer=<function lemmatized_words at 0x7f3b04922158>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), prep...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [39]:
log_acc = accuracy_score(pipelog_svd.predict(X_test), y_test)

In [40]:
print("Cross val mean logistic regression: ",log_cr.mean(), " | Validation on test set: ", log_acc)

Cross val mean logistic regression:  0.869244934759  | Validation on test set:  0.877654440154


* Random Forest

In [52]:
#Pipeline for Random Forest on 500 features
clfRF = RandomForestClassifier(n_estimators=1000)
pipeRF = Pipeline([('vectorizer',vectorizerCountLem),('svd', svd500),('clf',clfRF)])

In [None]:
rf_cr = cross_val_score(estimator=pipeRF,X=X_train,y=y_train, cv=3)

In [None]:
pipeRF.fit(X_train, y_train)
rf_acc = accuracy_score(pipeRF.predict(X_test), y_test)

In [None]:
print("Cross val mean random forest: ",rf_cr.mean(), " | Validation on test set: ", rf_acc)

### Visualization
* MDS

In [None]:
mds = manifold.MDS(n_components = 2, n_init = 1, max_iter = 1000)
data_2d_mds = mds.fit_transform(data_100)

In [None]:
pylab.figure(figsize=(10, 6))
pylab.scatter(data_2d_mds[:, 0], data_2d_mds[:, 1], c = labels)

### Conclusion
#### Best result is 94% with Logistic Regression with default values