In [7]:
# Mount Google Drive; the data files and the word2vec binary used below are read
# from paths under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import gensim
import numpy as np
from nltk.util import skipgrams
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, accuracy_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD

from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import SGDClassifier 
from sklearn.linear_model import LassoCV
from sklearn.naive_bayes import BernoulliNB, BaseNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from nltk.corpus import stopwords

import warnings

# Hide every FutureWarning so that cell outputs stay readable.
warnings.simplefilter("ignore", category=FutureWarning)

import nltk.data

# Download NLTK's "popular" resource collection, then load the English Punkt
# tokenizer pickle.
nltk.download("popular")

tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

In [0]:
# Folder on the mounted Drive that holds both TSV files of the task.
DATA_DIR = "/content/gdrive/My Drive/NLP/works/wrk01_sentiment/product-reviews-sentiment-analysis-light"

# Reviews to score for the submission. The file's own header row supplies the
# column names (later cells read `Id` and `text`).
dpred = pd.read_csv(f"{DATA_DIR}/products_sentiment_test.tsv", sep="\t")

# Labelled training reviews. The file is read as headerless, so the column names
# are assigned here.
dtrain = pd.read_csv(
    f"{DATA_DIR}/products_sentiment_train.tsv",
    sep="\t",
    header=None,
    names=["text", "response"],
)

# Shared cross-validation scheme: 5 folds repeated 4 times (20 scores per run).
# The seed pins the shuffles, so re-running the notebook reproduces the scores and
# every estimator evaluated with `cv` is compared on identical splits.
CV_SEED = 42
cv = RepeatedKFold(n_repeats=4, n_splits=5, random_state=CV_SEED)

# Simple Vectorizing

In [66]:
# Baseline: raw term counts (English stop words removed) fed to logistic
# regression, scored with the shared repeated K-fold splitter `cv`.
pipe = Pipeline(steps=[
    ("vec", CountVectorizer(stop_words="english")),
    ("clf", LogisticRegression()),
])

scores = cross_val_score(
    estimator=pipe,
    X=dtrain["text"],
    y=dtrain["response"],
    cv=cv,
)
print(f"avg={scores.mean()} std={scores.std()}")

avg=0.7466249999999999 std=0.012406525500719353


In [69]:
# TF-IDF features + logistic regression. Stop-word removal is switched off here
# (the `stop_words="english"` option is left disabled).
# This cell uses its own 10-fold x 4-repeat splitter rather than the shared `cv`.
pipe = Pipeline(steps=[
    ("vec", TfidfVectorizer()),
    ("clf", LogisticRegression()),
])

scores = cross_val_score(
    estimator=pipe,
    X=dtrain["text"],
    y=dtrain["response"],
    cv=RepeatedKFold(n_splits=10, n_repeats=4),
)
print(f"avg={scores.mean()} std={scores.std()}")

avg=0.7725 std=0.03426003502625181


In [0]:
# Scratch code for feature / error analysis, kept disabled (`pred` is not defined in the cells shown here).
# dtrain['pred'] = pred[:, 1]
# dtrain['aerr'] = abs(dtrain.pred - dtrain.response)

# dtrain.sort_values(by='aerr', ascending=False)

# pipe.fit(dtrain.text, dtrain.response)

# vec = pipe.named_steps['vec']
# res = vec.transform([dtrain.text[15]]).todense()
# list(filter(lambda x: x[1]!=0, zip(vec.get_feature_names(),res.tolist()[0])))

# def get_wei(word):
#     word_idx = pipe.named_steps['vec'].vocabulary_.get(word, None)
#     return pipe.named_steps['clf'].coef_[0][word_idx] if word_idx is not None else None

# txt = dtrain.text[488].split()
# vec = pipe.named_steps['vec'].transform([dtrain.text[15]]).todense()
# dict(zip(txt,list(map(get_wei, txt))))


In [77]:
# TF-IDF features + Bernoulli naive Bayes. Stop-word removal is switched off here
# (the `stop_words="english"` option is left disabled).
# This cell uses its own 10-fold x 4-repeat splitter rather than the shared `cv`.
pipe = Pipeline(steps=[
    ("vec", TfidfVectorizer()),
    ("clf", BernoulliNB()),
])

scores = cross_val_score(
    estimator=pipe,
    X=dtrain["text"],
    y=dtrain["response"],
    cv=RepeatedKFold(n_splits=10, n_repeats=4),
)
print(f"avg={scores.mean()} std={scores.std()}")

avg=0.75 std=0.030145480589965735


# Word2Vec


In [0]:
# Pretrained word2vec vectors, loaded from the gzipped binary file on Drive.
model = gensim.models.KeyedVectors.load_word2vec_format(
    fname="/content/gdrive/My Drive/NLP/models/GoogleNews-vectors-negative300.bin.gz",
    binary=True,
)

In [0]:
# TF-IDF vectorizer with default settings; its per-term weights scale the word
# vectors when the document embeddings are built in the cells below.
vec = TfidfVectorizer()

In [0]:
# Fit the vectorizer on the training text and keep the sparse document-term weights.
X = vec.fit_transform(dtrain["text"])

In [24]:
# Build one dense vector per training document: the TF-IDF-weighted sum of the
# word2vec embeddings of its terms. Terms without an embedding are skipped, and
# the sum is not normalised by the number of terms.

# Column index -> term lookup, built once. The original regenerated the complete
# feature-name list for every non-zero term of every document.
idx_to_word = {idx: word for word, idx in vec.vocabulary_.items()}

d2v = []
for irow in range(X.shape[0]):
  doc = np.zeros(model.vector_size)
  doc_row = X[irow]  # sparse 1 x n_features slice holding this document's weights
  # `indices` and `data` are aligned: column id and its TF-IDF weight.
  for col, weight in zip(doc_row.indices, doc_row.data):
    word = idx_to_word[col]
    if word in model:
      doc += weight * model.get_vector(word)
  # Lightweight progress indicator.
  if irow % 1000 == 0:
    print(irow)
  d2v.append(doc)


0
1000


In [68]:
# Scores noted from an earlier run (the setup that produced them is not recorded here):
# [0.83268482, 0.82397004, 0.80885312, 0.81214421, 0.85928705,
#        0.80230326, 0.8627451 , 0.80078895, 0.84130019, 0.81422925,
#        0.78823529, 0.84727273, 0.79919679, 0.84351145, 0.83908046,
#        0.83629191, 0.81335953, 0.79846449, 0.83992467, 0.83082707]

# Logistic regression on the TF-IDF-weighted word2vec document vectors, scored
# with the estimator's default metric on the shared splits.
est = LogisticRegression()
cross_val_score(estimator=est, X=d2v, y=dtrain["response"], cv=cv)

array([0.795 , 0.755 , 0.74  , 0.7975, 0.7475, 0.755 , 0.7875, 0.775 ,
       0.795 , 0.78  , 0.7725, 0.76  , 0.7825, 0.765 , 0.7375, 0.77  ,
       0.7825, 0.8   , 0.7975, 0.755 ])

In [32]:
import xgboost as xgb

# XGBoost settings used for both the cross-validation here and the final fit below.
par = dict(
    silent=False,
    scale_pos_weight=1,
    learning_rate=0.001,
    colsample_bytree=0.4,
    subsample=0.6,
    # objective='binary:logistic' is left disabled
    n_estimators=200,
    reg_alpha=0.7,
    max_depth=4,
    gamma=0,
)

# Alternative settings in grid form, kept for reference (disabled):
# par = {
#   "silent": [False], 
#   "scale_pos_weight": [1],
#   "learning_rate": [0.01],  
#   "colsample_bytree": [0.4],
#   "subsample": [0.8],
#   "objective": ['binary:logistic'], 
#   "n_estimators":[1000], 
#   "reg_alpha": [0.3],
#   "max_depth":[4], 
#   "gamma":[10]
# }

# F1 of the boosted trees on the document vectors, on the shared splits.
est = xgb.XGBClassifier(**par)
cross_val_score(
    estimator=est,
    X=np.array(d2v),
    y=dtrain["response"],
    cv=cv,
    scoring="f1",
)

array([0.77429984, 0.8370607 , 0.78088962, 0.81210191, 0.79099678,
       0.82792208, 0.78836834, 0.7987013 , 0.77723577, 0.7968    ,
       0.82009724, 0.7875817 , 0.78431373, 0.81361426, 0.77943615,
       0.80452342, 0.80511182, 0.78964401, 0.80322581, 0.79344262])

In [33]:
from lightgbm import LGBMClassifier

# LightGBM with library defaults on the same document vectors, scored by F1 on
# the shared splits.
est = LGBMClassifier()
cross_val_score(
    estimator=est,
    X=np.array(d2v),
    y=dtrain["response"],
    cv=cv,
    scoring="f1",
)

array([0.84916201, 0.83754513, 0.80373832, 0.82998172, 0.80926916,
       0.85130112, 0.81121495, 0.81343284, 0.81272085, 0.8383659 ,
       0.79924242, 0.84587814, 0.82899628, 0.82481752, 0.84436494,
       0.81588448, 0.83685221, 0.81886792, 0.82060391, 0.82709447])

In [0]:
# Cross-validated XGBoost predictions for every training row (default CV splitter).
xgb_pred = cross_val_predict(
    estimator=xgb.XGBClassifier(**par),
    X=np.array(d2v),
    y=dtrain["response"],
)

In [0]:
# Cross-validated LightGBM predictions for every training row (default CV splitter).
lgb_pred = cross_val_predict(
    estimator=LGBMClassifier(),
    X=np.array(d2v),
    y=dtrain["response"],
)

In [0]:
# Cross-validated logistic-regression predictions on the document vectors
# (default CV splitter).
lr_pred = cross_val_predict(
    estimator=LogisticRegression(),
    X=np.array(d2v),
    y=dtrain["response"],
)

In [0]:
# Cross-validated predictions of a 300-tree random forest on the document vectors
# (default CV splitter).
rf_pred = cross_val_predict(
    estimator=RandomForestClassifier(n_estimators=300),
    X=np.array(d2v),
    y=dtrain["response"],
)

In [0]:
# Cross-validated predictions of the text pipeline (TF-IDF without stop-word
# removal + logistic regression), for comparison and blending with the
# embedding-based models.
pipe = Pipeline(steps=[
    ("vec", TfidfVectorizer()),
    ("clf", LogisticRegression()),
])

ti_lr = cross_val_predict(estimator=pipe, X=dtrain["text"], y=dtrain["response"])

In [0]:
# Put the cross-validated predictions side by side, one column per model.
# Despite its name, `dtest` holds predictions for the *training* rows.
dtest = pd.DataFrame(
    dict(
        xgb=xgb_pred,
        lgb=lgb_pred,
        lr=lr_pred,
        rf=rf_pred,
        ti_lr=ti_lr,
    )
)

In [96]:
# Blend by averaging the per-model predicted labels row-wise and rounding the mean.
# F1 values noted for other column subsets tried earlier:
#   all columns of dtest   -> 0.8237037037037037
#   ["xgb", "lgb"]         -> 0.8209226298583364
f1_score(
    y_true=dtrain["response"],
    y_pred=dtest.loc[:, ["xgb", "lgb", "lr", "rf"]].mean(axis="columns").apply(round),
)

0.8311317386647625

In [103]:
# Accuracy of the three-model blend (LightGBM, logistic regression on document
# vectors, TF-IDF logistic regression): row-wise mean of the labels, rounded.
accuracy_score(
    y_true=dtrain["response"],
    y_pred=dtest.loc[:, ["lgb", "lr", "ti_lr"]].mean(axis="columns").apply(round),
)

0.7795

# PREDICT

## version 1

In [0]:
# Submission 1: TF-IDF (English stop words removed) + logistic regression,
# fitted on the full training set and applied to the prediction set.
pipe = Pipeline(steps=[
    ("vec", TfidfVectorizer(stop_words="english")),
    ("clf", LogisticRegression()),
])
pipe.fit(dtrain["text"], dtrain["response"])

# One row per review: its Id and the predicted label.
dresult = pd.DataFrame({"Id": dpred["Id"], "y": pipe.predict(dpred["text"])})
dresult.to_csv("tfidf_lr.csv", index=None)

## version 2

In [86]:
# Build the prediction-set document vectors the same way as the training ones:
# TF-IDF-weighted sums of word2vec embeddings (terms without an embedding are
# skipped; the sum is not normalised).
#
# The weights come from `vec`, the vectorizer already fitted on the training
# text, so it is only *applied* here. The original created an unused `vec_pred`
# and called `vec.fit_transform(dpred.text)`, which refitted the training
# vectorizer on the prediction text: that weighted these features with a
# different IDF than the classifiers were trained on, and left `vec` out of sync
# with `X` for any cell re-run afterwards.
x_pred = vec.transform(dpred.text)

# Column index -> term lookup, built once instead of regenerating the complete
# feature-name list for every non-zero term.
idx_to_word = {idx: word for word, idx in vec.vocabulary_.items()}

d2v_pred = []
for irow in range(x_pred.shape[0]):
  doc = np.zeros(model.vector_size)
  doc_row = x_pred[irow]  # sparse 1 x n_features slice holding this document's weights
  # `indices` and `data` are aligned: column id and its TF-IDF weight.
  for col, weight in zip(doc_row.indices, doc_row.data):
    word = idx_to_word[col]
    if word in model:
      doc += weight * model.get_vector(word)
  # Lightweight progress indicator.
  if irow % 1000 == 0:
    print(irow)
  d2v_pred.append(doc)
  


0


In [0]:
# Final XGBoost model: fit on all training document vectors, then keep the class
# probabilities for the prediction set.
xg_c = xgb.XGBClassifier(**par).fit(np.array(d2v), dtrain["response"])
xg_res = xg_c.predict_proba(np.array(d2v_pred))

In [0]:
# Final LightGBM model (library defaults): fit on all training document vectors,
# then keep the class probabilities for the prediction set.
lg_c = LGBMClassifier().fit(np.array(d2v), dtrain["response"])
lg_res = lg_c.predict_proba(np.array(d2v_pred))

In [0]:
# Final random forest (100 trees): fit on all training document vectors, then
# keep the class probabilities for the prediction set.
rf_c = RandomForestClassifier(n_estimators=100).fit(np.array(d2v), dtrain["response"])
rf_res = rf_c.predict_proba(np.array(d2v_pred))

In [0]:
# Final logistic regression on the word2vec document vectors.
lr_c = LogisticRegression()
lr_c.fit(np.array(d2v), dtrain.response)
# Score with lr_c itself. The original called rf_c.predict_proba here, which
# silently put the random forest's probabilities into the "lr" column of the
# blended submission.
lr_res = lr_c.predict_proba(np.array(d2v_pred))

In [0]:
# Final text pipeline: TF-IDF (English stop words removed) + logistic regression,
# fitted on all training text. `ti_lr` is rebound here to the class probabilities
# for the prediction set.
pipe = Pipeline(steps=[
    ("vec", TfidfVectorizer(stop_words="english")),
    ("clf", LogisticRegression()),
])
pipe.fit(dtrain["text"], dtrain["response"])

ti_lr = pipe.predict_proba(dpred["text"])

In [0]:
# Gather, per review Id, column 1 of each model's probability array.
dresult = pd.DataFrame(
    dict(
        Id=dpred["Id"],
        lg=lg_res[:, 1],
        lr=lr_res[:, 1],
        ti_lr=ti_lr[:, 1],
    )
)

In [0]:
# Final label: row-wise mean of the three models' probabilities, rounded to an integer.
dresult["y"] = (
    dresult.loc[:, ["lg", "lr", "ti_lr"]]
    .mean(axis="columns")
    .apply(round)
)

In [0]:
# Write the blended submission: only the Id and label columns, without the row index.
dresult.to_csv("HUSRADAAA.csv", columns=["Id", "y"], index=None)

In [111]:
# Quick look at the array's shape.
# NOTE(review): the recorded output `(500,)` is 1-D, yet the cell that builds
# `dresult` indexes `ti_lr[:, 1]`, which needs a 2-D array. The saved output
# presumably comes from an earlier, out-of-order run; confirm after Restart & Run All.
ti_lr.shape

(500,)