<a href="https://colab.research.google.com/github/purgen219/Text_classification/blob/master/fasttext.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lib

In [0]:
#!pip install imblearn
#!pip install fasttext
#!pip install gensim

#!wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip

#!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz
#!gzip -d cc.ru.300.vec.gz

In [0]:
import pandas as pd
import numpy as np
import scipy.stats as sc


import sklearn.model_selection
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

import fasttext
import fasttext.util

import re
import imblearn
from imblearn.over_sampling import RandomOverSampler
import gensim.downloader 
from gensim.models import KeyedVectors
import nltk

from xgboost import XGBClassifier
import xgboost as xgb
import xgboost as xgb

import time

# Data

In [20]:
data = pd.read_csv('PsyHack_RUDN_train.csv', sep='\t' , encoding= 'utf-8')
data['label'] = data['label'] == 'Depression'
data.head(10)

Unnamed: 0,ID,label,text
0,307,False,С детства я люблю футбол. Самые теплые воспоми...
1,243,False,Каждому человеку нужен мир. Мир- это весь земн...
2,73,False,"Я, другие и мир. Да, именно в таком порядке ме..."
3,301,False,Позиционирование себя как полноценного человек...
4,60,False,"Я, другие и мир. Такая неопределенная тема. Мы..."
5,128,False,К каждому человеку я отношусь изначально позит...
6,157,True,"«Я, другие, мир».Все меняется в этом мире. Мен..."
7,294,True,"Приветсвую, меня зовут Никита. Мне 20 полных л..."
8,67,False,В данном эссе я попытаюсь дать оценку своему х...
9,117,False,"Как много всего нас окружает, и как мало мы за..."


In [21]:
X = data['text']
y = data['label']
X_train, X_test , y_train,y_test = sklearn.model_selection.train_test_split(X,y,
                                                                            test_size=0.33,
                                                                            random_state=42)
ros = RandomOverSampler(random_state=0)
X_train_ros, y_train_ros = ros.fit_resample(pd.DataFrame(X_train),
                                            pd.DataFrame(y_train))

  y = column_or_1d(y, warn=True)


In [0]:
def text_prep(text):
    text = str(text).lower()
    text = re.sub(r'([\'\"\.\(\)\!\?\-\\\/\,])',r' \1 ',text)
    return text

In [0]:
def data_preproc_for_fasttext(data):
    text_result = ''
    for index, row in data.iterrows():
        label = "__label__" + str(row['label']) + ' '
        text = text_prep(row['text'])
        text_result += label+text+' \n '
    return text_result

In [0]:
def write_to_file(name_txt,data_):
    data_for_ft = data_preproc_for_fasttext(data_)
    text_file = open(name_txt, "w")
    text_file.write(data_for_ft)
    text_file.close()

In [0]:
def roc_auc(df_test,model):
    y_pred = []
    y_true = df_test['label'] 
    for index,row in df_test.iterrows():
        row = text_prep(row['text'])
        predict = model.predict(row)[0][0]
        if(predict == '__label__' + 'True'):
            y_pred.append(1)
        elif(predict == '__label__' + 'False'):
            y_pred.append(0)
        else:
            assert(1 == 0)
    #print(np.array(y_pred),np.array(y_true))
    return roc_auc_score(y_true, y_pred)

# Models

## Fasttext classification(without oversampling)


In [0]:
#split data 
data_train, data_test = sklearn.model_selection.train_test_split(data,
                                                                 test_size=0.33,
                                                                 random_state=42)
#write to file for ft
write_to_file("data_train_for_ft.txt", data_train)
write_to_file("data_test_for_ft.txt", data_test)

In [0]:
#Train the model
model_ft = fasttext.train_supervised(input='data_train_for_ft.txt', 
                                  autotuneValidationFile = "data_test_for_ft.txt")

In [44]:
#test roc_auc
roc_auc(data_test,model_ft)

0.7401960784313725

## fasttext classification(with oversampling)

In [0]:
data_test_ros = pd.concat([y_test, X_test], axis = 1)
data_train_ros = pd.concat([pd.DataFrame(y_train_ros, columns=['label']),
                            pd.DataFrame(X_train_ros, columns=['text'])], axis = 1)

write_to_file("data_train_ros_for_ft.txt", data_train_ros)
write_to_file("data_test_ros_for_ft.txt", data_test_ros)

In [0]:
model_ft_ros = fasttext.train_supervised(input='data_train_ros_for_ft.txt', 
                                      autotuneValidationFile = "data_test_ros_for_ft.txt")

In [47]:
roc_auc(data_test_ros,model_ft_ros)

0.7887700534759358

## Embedding fot text and some classic models (with oversampling)

In [18]:
#load w2w emb for words
ru_emb = KeyedVectors.load_word2vec_format("cc.ru.300.vec")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


### Sum w2w emb and some classic models

In [0]:
def vectorize_sum_w2w(text):
    """
    implement a function that converts preprocessed comment to a sum of token vectors
    """
    tokenizer = nltk.tokenize.WordPunctTokenizer()
    list_of_tokens = tokenizer.tokenize(text_prep(text))

    features = np.zeros(300)
    
    for token in list_of_tokens:
      if token in ru_emb.vocab:
        features += ru_emb[token]
      else:
        pass

    return features

In [0]:
X_train_ros_w2v = np.stack([vectorize_sum_w2w(text) for text in X_train_ros])
X_test_w2v = np.stack([vectorize_sum_w2w(text) for text in X_test])

#### RandomForestClassifier()

In [64]:
rf = RandomForestClassifier()
rf.fit(X_train_ros_w2v,y_train_ros)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
predict = rf.predict(X_test_w2v)

In [66]:
roc_auc_score(y_test,predict)
#we got a good result, so let's try to choose the parameters

0.8114973262032085

##### GridSearchCV()

In [80]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 1000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train_ros_w2v,y_train_ros)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   52.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.7min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [82]:
rf_random.best_params_

{'bootstrap': False,
 'max_depth': 80,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 155}

In [88]:
rf_random.best_estimator_.fit(X_train_ros_w2v,y_train_ros)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=80, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=155,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [89]:
predict = rf_random.best_estimator_.predict(X_test_w2v)
roc_auc_score(y_test,predict)
#good model

0.8342245989304813

### Sum w2w emb with idf coef and some classic models

In [0]:
tfidf = TfidfVectorizer(lowercase=False)
tfidf.fit_transform(X_train)

# dict key:word and value:tf-idf score
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

def vectorize_sum_w2w_idf(text):
    """
    implement a function that converts preprocessed comment to a sum of token vectors
    """
    tokenizer = nltk.tokenize.WordPunctTokenizer()
    list_of_tokens = tokenizer.tokenize(text_prep(text))

    features = np.zeros(300)
    
    for token in list_of_tokens:
      if token in ru_emb.vocab:
        if token in word2tfidf:
          idf = word2tfidf[token]
        else:
          idf = 0
        features += idf *np.array(ru_emb[token])
      else:
        pass

    return features


In [0]:
X_train_ros_w2v_idf = np.stack([vectorize_sum_w2w_idf(text) for text in X_train_ros])
X_test_w2v_idf = np.stack([vectorize_sum_w2w_idf(text) for text in X_test])

#### RandomForestClassifier

In [92]:
wv_idf_model = RandomForestClassifier()
wv_idf_model.fit(X_train_ros_w2v_idf,y_train_ros)
predict = wv_idf_model.predict(X_test_w2v_idf)
roc_auc_score(y_test,predict)
#the result with idf turned out worse

0.7009803921568628

# RandomSearchCV(xgb) with only sum emb (without idf coef) and oversamling

In [98]:
%%time

params = {
        "learning_rate":sc.uniform(0.05,0.3),
        'max_depth': sc.randint(2,15),
        'n_estimators' : sc.randint(5,500),
        'min_child_weight' : [ 1, 3, 5, 7 ],
        'gamma': sc.uniform(0.0,0.5)
        }

x_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', eta=0.02, n_jobs=-1)

xgb_random_search = RandomizedSearchCV(x_model, param_distributions = params,
                                       n_iter=30, scoring = 'neg_log_loss', 
                                       n_jobs = -1, cv=3, verbose=10)
xgb_random_search.fit(X_train_ros_w2v,y_train_ros)


Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   10.5s finished


CPU times: user 1.76 s, sys: 40.9 ms, total: 1.8 s
Wall time: 10.9 s


In [0]:
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = xgb_random_search.best_params_['max_depth']
params['n_estimators'] = xgb_random_search.best_params_['n_estimators']
params['learning_rate'] = xgb_random_search.best_params_['learning_rate']
params['min_child_weight'] = xgb_random_search.best_params_['min_child_weight']
params['gamma'] = xgb_random_search.best_params_['gamma']

d_train = xgb.DMatrix(X_train_ros_w2v, label= y_train_ros)
d_test = xgb.DMatrix(X_test_w2v, label = y_test)


In [100]:
bst = xgb.train(params, d_train)

predict_y = bst.predict(d_test)
roc_auc_score(y_test,predict_y)
#Grate model

0.8912655971479502

In [0]:
def make_sub(raw_test_data,model1,model2):
  return (model1.predict(raw_test_data),
          model2.predict(raw_test_data))

In [104]:
%%bash
git init
git config -- global user.email “kumeiko.se@phystech.edu”
git config -- global user.name “purgen219”

git add fasttext.ipynb
git commit -m 'First commit'
git remote add origin https://github.com/purgen219/Text_classification.git

git push -u origin master

Reinitialized existing Git repository in /content/.git/


error: key does not contain a section: global
error: key does not contain a section: global
fatal: pathspec 'fasttext.ipynb' did not match any files

*** Please tell me who you are.

Run

  git config --global user.email "you@example.com"
  git config --global user.name "Your Name"

to set your account's default identity.
Omit --global to set the identity only in this repository.

fatal: unable to auto-detect email address (got 'root@2c1e39bbe0f7.(none)')
fatal: remote origin already exists.
error: src refspec master does not match any.
error: failed to push some refs to 'https://github.com/purgen219/Text_classification.git'
