In [1]:
import pandas as pd
import nltk
import numpy as np
import string
from scipy.sparse import csr_matrix
from nltk.tokenize import  word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm.notebook import tqdm
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from spellchecker import SpellChecker
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
pip instanltk.download('wordnet')

zsh:1: unknown sort specifier
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install pyspellchecker

Looking in indexes: http://token:****@sberosc.sigma.sbrf.ru/repo/pypi/simple
Note: you may need to restart the kernel to use updated packages.


In [4]:
RANDOM_SEED = 21

In [5]:
df_negative = pd.read_csv('./data/processedNegative.csv').T.reset_index()
df_negative['class'] = 0

In [6]:
df_neutral = pd.read_csv('./data/processedNeutral.csv').T.reset_index()
df_neutral['class'] = 1

In [7]:
df_possitive = pd.read_csv('./data/processedPositive.csv').T.reset_index()
df_possitive['class'] = 2

In [8]:
df = df_negative.append(df_possitive).append(df_neutral)
df.reset_index(drop=True,inplace=True)
df.columns = ['text', 'class']

# Preprocesing

In [9]:
stop_words = nltk.corpus.stopwords.words('english')

In [10]:
text = ' '.join(df.text.map(str.lower).map(str.split).map(lambda x: [i for i in x if i not in stop_words]).map(' '.join).values)

In [11]:
text = text.translate(str.maketrans('', '', string.punctuation))

In [12]:
df.text = df.text\
.map(str.lower) \
.map(lambda x : x.translate(str.maketrans('', '', string.punctuation))) \
.map(str.split) \
.map(lambda x: [i for i in x if i not in stop_words]) \
.map(' '.join)

In [13]:
df.head()

Unnamed: 0,text,class
0,unhappy dogs like though,0
1,talking driver im goinghe said hed love go new...,0
2,anybody know rands likely fall dollar got mone...,0
3,miss going gigs liverpool unhappy,0
4,isnt new riverdale tonight unhappy,0


In [14]:
index_train, index_test =  map(lambda x: x.index, train_test_split(df.loc[:,'class'],stratify=df.loc[:,'class'], random_state=RANDOM_SEED))

In [15]:
index_train, index_val = map(lambda x: x.index ,train_test_split(df.loc[index_train,'class'], 
                                                                 stratify=df.loc[index_train, 'class'], 
                                                                 test_size=0.2,
                                                                )
                            )

# JUST TOKENIZATIONS

In [16]:
X, y = df['text'].copy(), df['class']

In [17]:
idx = index_train.append(index_val)

In [18]:
token_df_train = pd.Series(df.loc[idx,'text']).str.get_dummies(sep=' ')

In [19]:
token_df_train.head()

Unnamed: 0,0,000,000019,014736,04,041017,0570,0700am,0845am,09,...,yoyoure,yoyoyou,yummy,yura,yuri,zac,zcc,zero,zoo,zplus
565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
216,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
823,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2815,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2758,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
token_df_test = pd.Series(df.loc[index_test,'text']).str.get_dummies(sep=' ')

In [21]:
token_df_test = token_df_test.loc[:, set(token_df_train.columns).intersection(set(token_df_test))]

In [22]:
token_df_test.loc[:, set(token_df_train.columns) - set(token_df_train.columns).intersection(set(token_df_test.columns))] = 0

In [23]:
token_df_test

Unnamed: 0,till,arrived,south,tired,challenged,ha,ministry,remember,isnt,chain,...,sajeda,couldnt,buys,baron,dahildahil,karl,meds,controversy,lifetime,chaudhuri
2831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
922,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## CountVectorizer

In [24]:
count_vectorizer = CountVectorizer()

In [25]:
count_vectorizer.fit(df.loc[idx, 'text'])

CountVectorizer()

In [26]:
count_df_train = pd.DataFrame(data = count_vectorizer.transform(df.loc[idx,'text']).toarray(),
                              columns=count_vectorizer.get_feature_names(), 
                              index=idx
                             )

In [27]:
count_df_test = pd.DataFrame(data = count_vectorizer.transform(df.loc[index_test,'text']).toarray(),
                             columns=count_vectorizer.get_feature_names(), 
                             index=index_test,
                            )

In [28]:
count_df_test

Unnamed: 0,000,000019,014736,04,041017,0570,0700am,0845am,09,0xx9,...,yoyoure,yoyoyou,yummy,yura,yuri,zac,zcc,zero,zoo,zplus
2831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
922,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## TFIDFVectorizer

In [29]:
tfidf_vectorizer =  TfidfVectorizer()

In [30]:
tfidf_vectorizer.fit(df.loc[idx, 'text'])

TfidfVectorizer()

In [31]:
tfidf_df_train = pd.DataFrame(data = tfidf_vectorizer.transform(df.loc[idx,'text']).toarray(),
                              columns=tfidf_vectorizer.get_feature_names(), 
                              index=idx
                             )

In [32]:
tfidf_df_test = pd.DataFrame(data = tfidf_vectorizer.transform(df.loc[index_test,'text']).toarray(),
                              columns=tfidf_vectorizer.get_feature_names(), 
                              index=index_test
                             )

## Classification

In [33]:
bayes_class = MultinomialNB()
logreg = LogisticRegression() 

In [34]:
# index_train, index_val = map(lambda x: x.index ,train_test_split(df.loc[index_train,'class'], 
#                                                                  stratify=df.loc[index_train, 'class'], 
#                                                                  test_size=0.2,
#                                                                 )
#                             )

### Token

In [35]:
token_df_train

Unnamed: 0,0,000,000019,014736,04,041017,0570,0700am,0845am,09,...,yoyoure,yoyoyou,yummy,yura,yuri,zac,zcc,zero,zoo,zplus
565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
216,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
823,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2815,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2758,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2449,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3833,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2781,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
X_train, X_test, y_train, y_test = token_df_train.loc[index_train], token_df_train.loc[index_val], df.loc[index_train, 'class'], df.loc[index_val, 'class']
bayes_class.fit(X_train, y_train)
accuracy_score(y_test, bayes_class.predict(X_test))

0.8605851979345955

In [37]:
X_train, X_test, y_train, y_test = token_df_train.loc[index_train], token_df_train.loc[index_val], df.loc[index_train, 'class'], df.loc[index_val, 'class']
logreg.fit(X_train, y_train)
accuracy_score(y_test, logreg.predict(X_test))

0.8743545611015491

In [38]:
print("Результат на тестовом наборе данных:")
print(f"Линейная регрессия - {accuracy_score(df.loc[index_test, 'class'], logreg.predict(token_df_test))}")
print(f"Байес - {accuracy_score(df.loc[index_test, 'class'], bayes_class.predict(token_df_test))}")

Результат на тестовом наборе данных:
Линейная регрессия - 0.4169246646026832
Байес - 0.3498452012383901


### Count

In [39]:
X_train, X_test, y_train, y_test = count_df_train.loc[index_train], count_df_train.loc[index_val], df.loc[index_train, 'class'], df.loc[index_val, 'class']

In [40]:
logreg.fit(X_train, y_train)
accuracy_score(y_test, logreg.predict(X_test))

0.8760757314974182

In [41]:
bayes_class.fit(X_train, y_train)
accuracy_score(y_test, bayes_class.predict(X_test))

0.8588640275387264

In [42]:
print("Результат на тестовом наборе данных:")
print(f"Линейная регрессия - {accuracy_score(df.loc[index_test, 'class'], logreg.predict(count_df_test))}")
print(f"Байес - {accuracy_score(df.loc[index_test, 'class'], bayes_class.predict(count_df_test))}")

Результат на тестовом наборе данных:
Линейная регрессия - 0.8730650154798761
Байес - 0.8452012383900929


### TFIDF

In [43]:
X_train, X_test, y_train, y_test = tfidf_df_train.loc[index_train], tfidf_df_train.loc[index_val], df.loc[index_train, 'class'], df.loc[index_val, 'class']

In [44]:
bayes_class.fit(X_train, y_train)
accuracy_score(y_test, bayes_class.predict(X_test))

0.8709122203098106

In [45]:
logreg.fit(X_train, y_train)
accuracy_score(y_test, logreg.predict(X_test))

0.8674698795180723

In [46]:
print("Результат на тестовом наборе данных:")
print(f"Линейная регрессия - {accuracy_score(df.loc[index_test, 'class'], logreg.predict(tfidf_df_test))}")
print(f"Байес - {accuracy_score(df.loc[index_test, 'class'], bayes_class.predict(tfidf_df_test))}")

Результат на тестовом наборе данных:
Линейная регрессия - 0.8679050567595459
Байес - 0.8617131062951496


### COS_SIMYLARITY

In [47]:
M = cosine_similarity(pd.concat([tfidf_df_train, tfidf_df_test],axis=0))
cos_list = list()
for i, row in enumerate(M):
    for j, value in enumerate(row):
        if i != j:
            cos_list.append(((i, j), value))
cos_list.sort(key=lambda x: x[1],reverse=True)
for i, _ in cos_list[:20]:
    print(X[i[0]],'\nsimilar to\n', X[i[1]], '\n',sep='')

okay survived
similar to
reading hope something great happens smile

reading hope something great happens smile
similar to
okay survived

yuri didnt know unhappy second snsd missed
similar to
groovy thursday happy

reunion august unhappy
similar to
reverse polarisation special epaper link

dont want tell fellow comm students dont want think im kind suckup unhappy
similar to
unhappy lets fight kids

dont want tell fellow comm students dont want think im kind suckup unhappy
similar to
streaminuteg 45 minute understanding thing still unhappy

dont want tell fellow comm students dont want think im kind suckup unhappy
similar to
would sustained fame unhappy

dont want tell fellow comm students dont want think im kind suckup unhappy
similar to
friendly service

dont want tell fellow comm students dont want think im kind suckup unhappy
similar to
thanks recent follow happy connect happy great thursday want

dont want tell fellow comm students dont want think im kind suckup unhappy
similar to


# STEMING

In [48]:
X, y = df['text'].copy(), df['class']

In [49]:
stemmer = PorterStemmer()

In [50]:
X = X.map(lambda x: ' '.join([stemmer.stem(i) for i in x.split()]))

In [51]:
token_df_train = pd.Series(X.loc[idx]).str.get_dummies(sep=' ')

In [52]:
token_df_train.head()

Unnamed: 0,0,000,000019,014736,04,041017,0570,0700am,0845am,09,...,yoyour,yoyoy,yummi,yura,yuri,zac,zcc,zero,zoo,zplu
565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
216,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
823,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2815,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2758,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
token_df_test = pd.Series(X.loc[index_test]).str.get_dummies(sep=' ')

In [54]:
token_df_test = token_df_test.loc[:, set(token_df_train.columns).intersection(set(token_df_test))]

In [55]:
token_df_test.loc[:, set(token_df_train.columns) - set(token_df_train.columns).intersection(set(token_df_test.columns))] = 0

In [56]:
token_df_test

Unnamed: 0,till,south,invit,ha,clash,isnt,chain,bc,captain,friend,...,violat,sajeda,couldnt,baron,dahildahil,karl,sonobuoysps,winthi,flower,figur
2831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
922,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## CountVectorizer

In [57]:
count_vectorizer = CountVectorizer()

In [58]:
count_vectorizer.fit(X.loc[idx])

CountVectorizer()

In [59]:
count_df_train = pd.DataFrame(data = count_vectorizer.transform(X.loc[idx]).toarray(),
                              columns=count_vectorizer.get_feature_names(), 
                              index=idx
                             )

In [60]:
count_df_test = pd.DataFrame(data = count_vectorizer.transform(X.loc[index_test]).toarray(),
                             columns=count_vectorizer.get_feature_names(), 
                             index=index_test,
                            )

In [61]:
count_df_test

Unnamed: 0,000,000019,014736,04,041017,0570,0700am,0845am,09,0xx9,...,yoyour,yoyoy,yummi,yura,yuri,zac,zcc,zero,zoo,zplu
2831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
922,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## TFIDFVectorizer

In [62]:
tfidf_vectorizer =  TfidfVectorizer()

In [63]:
tfidf_vectorizer.fit(X.loc[idx])

TfidfVectorizer()

In [64]:
tfidf_df_train = pd.DataFrame(data = tfidf_vectorizer.transform(X.loc[idx]).toarray(),
                              columns=tfidf_vectorizer.get_feature_names(), 
                              index=idx
                             )

In [65]:
tfidf_df_test = pd.DataFrame(data = tfidf_vectorizer.transform(X.loc[index_test]).toarray(),
                              columns=tfidf_vectorizer.get_feature_names(), 
                              index=index_test
                             )

In [66]:
tfidf_df_train.head()

Unnamed: 0,000,000019,014736,04,041017,0570,0700am,0845am,09,0xx9,...,yoyour,yoyoy,yummi,yura,yuri,zac,zcc,zero,zoo,zplu
565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
823,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Classification

In [67]:
bayes_class = MultinomialNB()
logreg = LogisticRegression() 

In [68]:
# index_train, index_val = map(lambda x: x.index ,train_test_split(y.loc[index_train], 
#                                                                  stratify=y.loc[index_train], 
#                                                                  test_size=0.2,
#                                                                 )
#                             )

### Token

In [69]:
token_df_test

Unnamed: 0,till,south,invit,ha,clash,isnt,chain,bc,captain,friend,...,violat,sajeda,couldnt,baron,dahildahil,karl,sonobuoysps,winthi,flower,figur
2831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
922,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
X_train, X_test, y_train, y_test = token_df_train.loc[index_train], token_df_train.loc[index_val], y.loc[index_train], y.loc[index_val]
bayes_class.fit(X_train, y_train)
accuracy_score(y_test, bayes_class.predict(X_test))

0.8623063683304647

In [71]:
X_train, X_test, y_train, y_test = token_df_train.loc[index_train], token_df_train.loc[index_val], y.loc[index_train], y.loc[index_val]
logreg.fit(X_train, y_train)
accuracy_score(y_test, logreg.predict(X_test))

0.8691910499139415

In [72]:
print("Результат на тестовом наборе данных:")
print(f"Линейная регрессия - {accuracy_score(y.loc[index_test], logreg.predict(token_df_test))}")
print(f"Байес - {accuracy_score(y.loc[index_test], bayes_class.predict(token_df_test))}")

Результат на тестовом наборе данных:
Линейная регрессия - 0.4066047471620227
Байес - 0.42105263157894735


### Count

In [73]:
X_train, X_test, y_train, y_test = count_df_train.loc[index_train], count_df_train.loc[index_val], df.loc[index_train, 'class'], df.loc[index_val, 'class']

In [74]:
logreg.fit(X_train, y_train)
accuracy_score(y_test, logreg.predict(X_test))

0.8709122203098106

In [75]:
bayes_class.fit(X_train, y_train)
accuracy_score(y_test, bayes_class.predict(X_test))

0.8605851979345955

In [76]:
print("Результат на тестовом наборе данных:")
print(f"Линейная регрессия - {accuracy_score(y.loc[index_test], logreg.predict(count_df_test))}")
print(f"Байес - {accuracy_score(y.loc[index_test], bayes_class.predict(count_df_test))}")

Результат на тестовом наборе данных:
Линейная регрессия - 0.8740970072239422
Байес - 0.8534571723426213


### TFIDF

In [77]:
X_train, X_test, y_train, y_test = tfidf_df_train.loc[index_train], tfidf_df_train.loc[index_val], df.loc[index_train, 'class'], df.loc[index_val, 'class']

In [78]:
bayes_class.fit(X_train, y_train)
accuracy_score(y_test, bayes_class.predict(X_test))

0.8588640275387264

In [79]:
logreg.fit(X_train, y_train)
accuracy_score(y_test, logreg.predict(X_test))

0.8640275387263339

In [80]:
print("Результат на тестовом наборе данных:")
print(f"Линейная регрессия - {accuracy_score(df.loc[index_test, 'class'], logreg.predict(tfidf_df_test))}")
print(f"Байес - {accuracy_score(df.loc[index_test, 'class'], bayes_class.predict(tfidf_df_test))}")

Результат на тестовом наборе данных:
Линейная регрессия - 0.8720330237358102
Байес - 0.8555211558307534


### COS_SIMYLARITY

In [81]:
M = cosine_similarity(pd.concat([tfidf_df_train, tfidf_df_test],axis=0))
cos_list = list()
for i, row in enumerate(M):
    for j, value in enumerate(row):
        if i != j:
            cos_list.append(((i, j), value))
cos_list.sort(key=lambda x: x[1],reverse=True)
for i, _ in cos_list[:20]:
    print(X[i[0]],'\nsimilar to\n', X[i[1]], '\n',sep='')

miss unhappi
similar to
make great point

word never miss articl happi
similar to
molten tar wipe famili

make great point
similar to
miss unhappi

molten tar wipe famili
similar to
word never miss articl happi

becoz depend promot wast hardwork team
similar to
regim chang bulli polic offic

rain hard unhappi
similar to
cute pictur though

yuri didnt know unhappi second snsd miss
similar to
groovi thursday happi

face swap cat dog realli upset unhappi 1
similar to
final next week unhappi wish luck x

realli amount load section unhappi ill ride netherton meet darbi end
similar to
call center nice game 420 unhappi

realli amount load section unhappi ill ride netherton meet darbi end
similar to
would never forget

reunion august unhappi
similar to
revers polaris special epap link

dont want tell fellow comm student dont want think im kind suckup unhappi
similar to
unhappi let fight kid

dont want tell fellow comm student dont want think im kind suckup unhappi
similar to
streaminuteg 45 mi

# LEMMATIZATON

In [82]:
X, y = df['text'].copy(), df['class']

In [83]:
lemmer = WordNetLemmatizer()

In [84]:
lemmer.lemmatize('loving')

'loving'

In [85]:
X = X.map(lambda x: ' '.join([lemmer.lemmatize(i) for i in x.split()]))

In [86]:
token_df_train = pd.Series(X.loc[idx]).str.get_dummies(sep=' ')

In [87]:
token_df_train.head()

Unnamed: 0,0,000,000019,014736,04,041017,0570,0700am,0845am,09,...,yoyoure,yoyoyou,yummy,yura,yuri,zac,zcc,zero,zoo,zplus
565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
216,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
823,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2815,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2758,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [88]:
token_df_test = pd.Series(X.loc[index_test]).str.get_dummies(sep=' ')

In [89]:
token_df_test = token_df_test.loc[:, set(token_df_train.columns).intersection(set(token_df_test))]

In [90]:
token_df_test.loc[:, set(token_df_train.columns) - set(token_df_train.columns).intersection(set(token_df_test.columns))] = 0

In [91]:
token_df_test

Unnamed: 0,till,arrived,south,tired,challenged,clash,ha,ministry,remember,isnt,...,responding,sajeda,couldnt,baron,dahildahil,karl,controversy,bottle,lifetime,chaudhuri
2831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
922,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## CountVectorizer

In [92]:
count_vectorizer = CountVectorizer()

In [93]:
count_vectorizer.fit(X.loc[idx])

CountVectorizer()

In [94]:
count_df_train = pd.DataFrame(data = count_vectorizer.transform(X.loc[idx]).toarray(),
                              columns=count_vectorizer.get_feature_names(), 
                              index=idx
                             )

In [95]:
count_df_test = pd.DataFrame(data = count_vectorizer.transform(X.loc[index_test]).toarray(),
                             columns=count_vectorizer.get_feature_names(), 
                             index=index_test,
                            )

In [96]:
count_df_test

Unnamed: 0,000,000019,014736,04,041017,0570,0700am,0845am,09,0xx9,...,yoyoure,yoyoyou,yummy,yura,yuri,zac,zcc,zero,zoo,zplus
2831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
922,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## TFIDFVectorizer

In [97]:
tfidf_vectorizer =  TfidfVectorizer()

In [98]:
tfidf_vectorizer.fit(X.loc[idx])

TfidfVectorizer()

In [99]:
tfidf_df_train = pd.DataFrame(data = tfidf_vectorizer.transform(X.loc[idx]).toarray(),
                              columns=tfidf_vectorizer.get_feature_names(), 
                              index=idx
                             )

In [100]:
tfidf_df_test = pd.DataFrame(data = tfidf_vectorizer.transform(X.loc[index_test]).toarray(),
                              columns=tfidf_vectorizer.get_feature_names(), 
                              index=index_test
                             )

In [101]:
tfidf_df_train.head()

Unnamed: 0,000,000019,014736,04,041017,0570,0700am,0845am,09,0xx9,...,yoyoure,yoyoyou,yummy,yura,yuri,zac,zcc,zero,zoo,zplus
565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
823,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Classification

In [102]:
bayes_class = MultinomialNB()
logreg = LogisticRegression() 

In [103]:
# index_train, index_val = map(lambda x: x.index ,train_test_split(y.loc[index_train], 
#                                                                  stratify=y.loc[index_train], 
#                                                                  test_size=0.2,
#                                                                 )
#                             )

### Token

In [104]:
token_df_test

Unnamed: 0,till,arrived,south,tired,challenged,clash,ha,ministry,remember,isnt,...,responding,sajeda,couldnt,baron,dahildahil,karl,controversy,bottle,lifetime,chaudhuri
2831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
922,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [105]:
X_train, X_test, y_train, y_test = token_df_train.loc[index_train], token_df_train.loc[index_val], y.loc[index_train], y.loc[index_val]
bayes_class.fit(X_train, y_train)
accuracy_score(y_test, bayes_class.predict(X_test))

0.8743545611015491

In [106]:
X_train, X_test, y_train, y_test = token_df_train.loc[index_train], token_df_train.loc[index_val], y.loc[index_train], y.loc[index_val]
logreg.fit(X_train, y_train)
accuracy_score(y_test, logreg.predict(X_test))

0.8640275387263339

In [107]:
print("Результат на тестовом наборе данных:")
print(f"Линейная регрессия - {accuracy_score(y.loc[index_test], logreg.predict(token_df_test))}")
print(f"Байес - {accuracy_score(y.loc[index_test], bayes_class.predict(token_df_test))}")

Результат на тестовом наборе данных:
Линейная регрессия - 0.40144478844169246
Байес - 0.36429308565531476


### Count

In [108]:
X_train, X_test, y_train, y_test = count_df_train.loc[index_train], count_df_train.loc[index_val], df.loc[index_train, 'class'], df.loc[index_val, 'class']

In [109]:
logreg.fit(X_train, y_train)
accuracy_score(y_test, logreg.predict(X_test))

0.8640275387263339

In [110]:
bayes_class.fit(X_train, y_train)
accuracy_score(y_test, bayes_class.predict(X_test))

0.8777969018932874

In [111]:
print("Результат на тестовом наборе данных:")
print(f"Линейная регрессия - {accuracy_score(y.loc[index_test], logreg.predict(count_df_test))}")
print(f"Байес - {accuracy_score(y.loc[index_test], bayes_class.predict(count_df_test))}")

Результат на тестовом наборе данных:
Линейная регрессия - 0.8648090815273478
Байес - 0.849329205366357


### TFIDF

In [112]:
X_train, X_test, y_train, y_test = tfidf_df_train.loc[index_train], tfidf_df_train.loc[index_val], df.loc[index_train, 'class'], df.loc[index_val, 'class']

In [113]:
bayes_class.fit(X_train, y_train)
accuracy_score(y_test, bayes_class.predict(X_test))

0.8726333907056799

In [114]:
logreg.fit(X_train, y_train)
accuracy_score(y_test, logreg.predict(X_test))

0.8657487091222031

In [115]:
print("Результат на тестовом наборе данных:")
print(f"Линейная регрессия - {accuracy_score(df.loc[index_test, 'class'], logreg.predict(tfidf_df_test))}")
print(f"Байес - {accuracy_score(df.loc[index_test, 'class'], bayes_class.predict(tfidf_df_test))}")

Результат на тестовом наборе данных:
Линейная регрессия - 0.8658410732714138
Байес - 0.8555211558307534


### COS_SIMYLARITY

In [116]:
M = cosine_similarity(pd.concat([tfidf_df_train, tfidf_df_test],axis=0))
cos_list = list()
for i, row in enumerate(M):
    for j, value in enumerate(row):
        if i != j:
            cos_list.append(((i, j), value))
cos_list.sort(key=lambda x: x[1],reverse=True)
for i, _ in cos_list[:20]:
    print(X[i[0]],'\nsimilar to\n', X[i[1]], '\n',sep='')

need cheering
similar to
say kanwal sibal

need cheering
similar to
bypassing highway ban

koala dying thirst u unhappy 9
similar to
mutual let talk moreme suremutual happy me9

koala dying thirst u unhappy 9
similar to
europe

never want finish desperate housewife life unhappy
similar to
popping store

popping store
similar to
never want finish desperate housewife life unhappy

mutual let talk moreme suremutual happy me9
similar to
koala dying thirst u unhappy 9

mutual let talk moreme suremutual happy me9
similar to
europe

say kanwal sibal
similar to
need cheering

say kanwal sibal
similar to
bypassing highway ban

bypassing highway ban
similar to
need cheering

bypassing highway ban
similar to
say kanwal sibal

europe
similar to
koala dying thirst u unhappy 9

europe
similar to
mutual let talk moreme suremutual happy me9

reunion august unhappy
similar to
reverse polarisation special epaper link

dont want tell fellow comm student dont want think im kind suckup unhappy
similar to
u

# STEMMING + MISSPELING

In [117]:
X, y = df['text'].copy(), df['class']

In [118]:
stemmer = PorterStemmer()

In [119]:
X = X.map(lambda x: ' '.join([stemmer.stem(i) for i in x.split()]))

In [120]:
X.shape

(3873,)

In [121]:
token_df_train = pd.Series(X.loc[idx]).str.get_dummies(sep=' ')

In [122]:
token_df_train.head()

Unnamed: 0,0,000,000019,014736,04,041017,0570,0700am,0845am,09,...,yoyour,yoyoy,yummi,yura,yuri,zac,zcc,zero,zoo,zplu
565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
216,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
823,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2815,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2758,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [123]:
token_df_test = pd.Series(X.loc[index_test]).str.get_dummies(sep=' ')

In [124]:
token_df_test = token_df_test.loc[:, set(token_df_train.columns).intersection(set(token_df_test))]

In [125]:
token_df_test.loc[:, set(token_df_train.columns) - set(token_df_train.columns).intersection(set(token_df_test.columns))] = 0

In [126]:
token_df_test

Unnamed: 0,till,south,invit,ha,clash,isnt,chain,bc,captain,friend,...,violat,sajeda,couldnt,baron,dahildahil,karl,sonobuoysps,winthi,flower,figur
2831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
922,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## CountVectorizer

In [127]:
count_vectorizer = CountVectorizer()

In [128]:
count_vectorizer.fit(X.loc[idx])

CountVectorizer()

In [129]:
count_df_train = pd.DataFrame(data = count_vectorizer.transform(X.loc[idx]).toarray(),
                              columns=count_vectorizer.get_feature_names(), 
                              index=idx
                             )

In [130]:
count_df_test = pd.DataFrame(data = count_vectorizer.transform(X.loc[index_test]).toarray(),
                             columns=count_vectorizer.get_feature_names(), 
                             index=index_test,
                            )

In [131]:
count_df_test

Unnamed: 0,000,000019,014736,04,041017,0570,0700am,0845am,09,0xx9,...,yoyour,yoyoy,yummi,yura,yuri,zac,zcc,zero,zoo,zplu
2831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
922,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## TFIDFVectorizer

In [132]:
tfidf_vectorizer =  TfidfVectorizer()

In [133]:
tfidf_vectorizer.fit(X.loc[idx])

TfidfVectorizer()

In [134]:
tfidf_df_train = pd.DataFrame(data = tfidf_vectorizer.transform(X.loc[idx]).toarray(),
                              columns=tfidf_vectorizer.get_feature_names(), 
                              index=idx
                             )

In [135]:
tfidf_df_test = pd.DataFrame(data = tfidf_vectorizer.transform(X.loc[index_test]).toarray(),
                              columns=tfidf_vectorizer.get_feature_names(), 
                              index=index_test
                             )

In [136]:
tfidf_df_train.head()

Unnamed: 0,000,000019,014736,04,041017,0570,0700am,0845am,09,0xx9,...,yoyour,yoyoy,yummi,yura,yuri,zac,zcc,zero,zoo,zplu
565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
823,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Classification

In [137]:
bayes_class = MultinomialNB()
logreg = LogisticRegression() 

In [138]:
# index_train, index_val = map(lambda x: x.index ,train_test_split(y.loc[index_train], 
#                                                                  stratify=y.loc[index_train], 
#                                                                  test_size=0.2,
#                                                                 )
#                             )

### Token

In [139]:
token_df_test

Unnamed: 0,till,south,invit,ha,clash,isnt,chain,bc,captain,friend,...,violat,sajeda,couldnt,baron,dahildahil,karl,sonobuoysps,winthi,flower,figur
2831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
922,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [140]:
X_train, X_test, y_train, y_test = token_df_train.loc[index_train], token_df_train.loc[index_val], y.loc[index_train], y.loc[index_val]
bayes_class.fit(X_train, y_train)
accuracy_score(y_test, bayes_class.predict(X_test))

0.8623063683304647

In [141]:
X_train, X_test, y_train, y_test = token_df_train.loc[index_train], token_df_train.loc[index_val], y.loc[index_train], y.loc[index_val]
logreg.fit(X_train, y_train)
accuracy_score(y_test, logreg.predict(X_test))

0.8691910499139415

In [142]:
print("Результат на тестовом наборе данных:")
print(f"Линейная регрессия - {accuracy_score(y.loc[index_test], logreg.predict(token_df_test))}")
print(f"Байес - {accuracy_score(y.loc[index_test], bayes_class.predict(token_df_test))}")

Результат на тестовом наборе данных:
Линейная регрессия - 0.4066047471620227
Байес - 0.42105263157894735


### Count

In [143]:
X_train, X_test, y_train, y_test = count_df_train.loc[index_train], count_df_train.loc[index_val], df.loc[index_train, 'class'], df.loc[index_val, 'class']

In [144]:
logreg.fit(X_train, y_train)
accuracy_score(y_test, logreg.predict(X_test))

0.8709122203098106

In [145]:
bayes_class.fit(X_train, y_train)
accuracy_score(y_test, bayes_class.predict(X_test))

0.8605851979345955

In [146]:
print("Результат на тестовом наборе данных:")
print(f"Линейная регрессия - {accuracy_score(y.loc[index_test], logreg.predict(count_df_test))}")
print(f"Байес - {accuracy_score(y.loc[index_test], bayes_class.predict(count_df_test))}")

Результат на тестовом наборе данных:
Линейная регрессия - 0.8740970072239422
Байес - 0.8534571723426213


### TFIDF

In [147]:
X_train, X_test, y_train, y_test = tfidf_df_train.loc[index_train], tfidf_df_train.loc[index_val], df.loc[index_train, 'class'], df.loc[index_val, 'class']

In [148]:
bayes_class.fit(X_train, y_train)
accuracy_score(y_test, bayes_class.predict(X_test))

0.8588640275387264

In [149]:
logreg.fit(X_train, y_train)
accuracy_score(y_test, logreg.predict(X_test))

0.8640275387263339

In [150]:
print("Результат на тестовом наборе данных:")
print(f"Линейная регрессия - {accuracy_score(df.loc[index_test, 'class'], logreg.predict(tfidf_df_test))}")
print(f"Байес - {accuracy_score(df.loc[index_test, 'class'], bayes_class.predict(tfidf_df_test))}")

Результат на тестовом наборе данных:
Линейная регрессия - 0.8720330237358102
Байес - 0.8555211558307534


### COS_SIMYLARITY

In [151]:
M = cosine_similarity(pd.concat([tfidf_df_train, tfidf_df_test],axis=0))
cos_list = list()
for i, row in enumerate(M):
    for j, value in enumerate(row):
        if i != j:
            cos_list.append(((i, j), value))
cos_list.sort(key=lambda x: x[1],reverse=True)
for i, _ in cos_list[:20]:
    print(X[i[0]],'\nsimilar to\n', X[i[1]], '\n',sep='')

miss unhappi
similar to
make great point

word never miss articl happi
similar to
molten tar wipe famili

make great point
similar to
miss unhappi

molten tar wipe famili
similar to
word never miss articl happi

becoz depend promot wast hardwork team
similar to
regim chang bulli polic offic

rain hard unhappi
similar to
cute pictur though

yuri didnt know unhappi second snsd miss
similar to
groovi thursday happi

face swap cat dog realli upset unhappi 1
similar to
final next week unhappi wish luck x

realli amount load section unhappi ill ride netherton meet darbi end
similar to
call center nice game 420 unhappi

realli amount load section unhappi ill ride netherton meet darbi end
similar to
would never forget

reunion august unhappi
similar to
revers polaris special epap link

dont want tell fellow comm student dont want think im kind suckup unhappi
similar to
unhappi let fight kid

dont want tell fellow comm student dont want think im kind suckup unhappi
similar to
streaminuteg 45 mi

# LEMMING + MISSPELING

In [152]:
X, y = df['text'].copy(), df['class']

In [153]:
lemmer = WordNetLemmatizer()

In [154]:
X = X.map(lambda x: ' '.join([lemmer.lemmatize(i) for i in x.split()]))

In [155]:
token_df_train = pd.Series(X.loc[idx]).str.get_dummies(sep=' ')

In [156]:
token_df_train.head()

Unnamed: 0,0,000,000019,014736,04,041017,0570,0700am,0845am,09,...,yoyoure,yoyoyou,yummy,yura,yuri,zac,zcc,zero,zoo,zplus
565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
216,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
823,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2815,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2758,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [157]:
token_df_test = pd.Series(X.loc[index_test]).str.get_dummies(sep=' ')

In [158]:
token_df_test = token_df_test.loc[:, set(token_df_train.columns).intersection(set(token_df_test))]

In [159]:
token_df_test.loc[:, set(token_df_train.columns) - set(token_df_train.columns).intersection(set(token_df_test.columns))] = 0

In [160]:
token_df_test

Unnamed: 0,till,arrived,south,tired,challenged,clash,ha,ministry,remember,isnt,...,responding,sajeda,couldnt,baron,dahildahil,karl,controversy,bottle,lifetime,chaudhuri
2831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
922,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## CountVectorizer

In [161]:
count_vectorizer = CountVectorizer()

In [162]:
count_vectorizer.fit(X.loc[idx])

CountVectorizer()

In [163]:
count_df_train = pd.DataFrame(data = count_vectorizer.transform(X.loc[idx]).toarray(),
                              columns=count_vectorizer.get_feature_names(), 
                              index=idx
                             )

In [164]:
count_df_test = pd.DataFrame(data = count_vectorizer.transform(X.loc[index_test]).toarray(),
                             columns=count_vectorizer.get_feature_names(), 
                             index=index_test,
                            )

In [165]:
count_df_test

Unnamed: 0,000,000019,014736,04,041017,0570,0700am,0845am,09,0xx9,...,yoyoure,yoyoyou,yummy,yura,yuri,zac,zcc,zero,zoo,zplus
2831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
922,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## TFIDFVectorizer

In [166]:
tfidf_vectorizer =  TfidfVectorizer()

In [167]:
tfidf_vectorizer.fit(X.loc[idx])

TfidfVectorizer()

In [168]:
tfidf_df_train = pd.DataFrame(data = tfidf_vectorizer.transform(X.loc[idx]).toarray(),
                              columns=tfidf_vectorizer.get_feature_names(), 
                              index=idx
                             )

In [169]:
tfidf_df_test = pd.DataFrame(data = tfidf_vectorizer.transform(X.loc[index_test]).toarray(),
                              columns=tfidf_vectorizer.get_feature_names(), 
                              index=index_test
                             )

In [170]:
tfidf_df_train.head()

Unnamed: 0,000,000019,014736,04,041017,0570,0700am,0845am,09,0xx9,...,yoyoure,yoyoyou,yummy,yura,yuri,zac,zcc,zero,zoo,zplus
565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
823,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Classification

In [171]:
bayes_class = MultinomialNB()
logreg = LogisticRegression() 

In [172]:
# index_train, index_val = map(lambda x: x.index ,train_test_split(y.loc[index_train], 
#                                                                  stratify=y.loc[index_train], 
#                                                                  test_size=0.2,
#                                                                 )
#                             )

### Token

In [173]:
token_df_test

Unnamed: 0,till,arrived,south,tired,challenged,clash,ha,ministry,remember,isnt,...,responding,sajeda,couldnt,baron,dahildahil,karl,controversy,bottle,lifetime,chaudhuri
2831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
922,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [174]:
X_train, X_test, y_train, y_test = token_df_train.loc[index_train], token_df_train.loc[index_val], y.loc[index_train], y.loc[index_val]
bayes_class.fit(X_train, y_train)
accuracy_score(y_test, bayes_class.predict(X_test))

0.8743545611015491

In [175]:
X_train, X_test, y_train, y_test = token_df_train.loc[index_train], token_df_train.loc[index_val], y.loc[index_train], y.loc[index_val]
logreg.fit(X_train, y_train)
accuracy_score(y_test, logreg.predict(X_test))

0.8640275387263339

In [176]:
print("Результат на тестовом наборе данных:")
print(f"Линейная регрессия - {accuracy_score(y.loc[index_test], logreg.predict(token_df_test))}")
print(f"Байес - {accuracy_score(y.loc[index_test], bayes_class.predict(token_df_test))}")

Результат на тестовом наборе данных:
Линейная регрессия - 0.40144478844169246
Байес - 0.36429308565531476


### Count

In [177]:
X_train, X_test, y_train, y_test = count_df_train.loc[index_train], count_df_train.loc[index_val], df.loc[index_train, 'class'], df.loc[index_val, 'class']

In [178]:
logreg.fit(X_train, y_train)
accuracy_score(y_test, logreg.predict(X_test))

0.8640275387263339

In [179]:
bayes_class.fit(X_train, y_train)
accuracy_score(y_test, bayes_class.predict(X_test))

0.8777969018932874

In [180]:
print("Результат на тестовом наборе данных:")
print(f"Линейная регрессия - {accuracy_score(y.loc[index_test], logreg.predict(count_df_test))}")
print(f"Байес - {accuracy_score(y.loc[index_test], bayes_class.predict(count_df_test))}")

Результат на тестовом наборе данных:
Линейная регрессия - 0.8648090815273478
Байес - 0.849329205366357


### TFIDF

In [181]:
X_train, X_test, y_train, y_test = tfidf_df_train.loc[index_train], tfidf_df_train.loc[index_val], df.loc[index_train, 'class'], df.loc[index_val, 'class']

In [182]:
bayes_class.fit(X_train, y_train)
accuracy_score(y_test, bayes_class.predict(X_test))

0.8726333907056799

In [183]:
logreg.fit(X_train, y_train)
accuracy_score(y_test, logreg.predict(X_test))

0.8657487091222031

In [184]:
print("Результат на тестовом наборе данных:")
print(f"Линейная регрессия - {accuracy_score(df.loc[index_test, 'class'], logreg.predict(tfidf_df_test))}")
print(f"Байес - {accuracy_score(df.loc[index_test, 'class'], bayes_class.predict(tfidf_df_test))}")

Результат на тестовом наборе данных:
Линейная регрессия - 0.8658410732714138
Байес - 0.8555211558307534


In [185]:
M = cosine_similarity(pd.concat([tfidf_df_train, tfidf_df_test],axis=0))
cos_list = list()
for i, row in enumerate(M):
    for j, value in enumerate(row):
        if i != j:
            cos_list.append(((i, j), value))
cos_list.sort(key=lambda x: x[1],reverse=True)
for i, _ in cos_list[:20]:
    print(X[i[0]],'\nsimilar to\n', X[i[1]], '\n',sep='')

need cheering
similar to
say kanwal sibal

need cheering
similar to
bypassing highway ban

koala dying thirst u unhappy 9
similar to
mutual let talk moreme suremutual happy me9

koala dying thirst u unhappy 9
similar to
europe

never want finish desperate housewife life unhappy
similar to
popping store

popping store
similar to
never want finish desperate housewife life unhappy

mutual let talk moreme suremutual happy me9
similar to
koala dying thirst u unhappy 9

mutual let talk moreme suremutual happy me9
similar to
europe

say kanwal sibal
similar to
need cheering

say kanwal sibal
similar to
bypassing highway ban

bypassing highway ban
similar to
need cheering

bypassing highway ban
similar to
say kanwal sibal

europe
similar to
koala dying thirst u unhappy 9

europe
similar to
mutual let talk moreme suremutual happy me9

reunion august unhappy
similar to
reverse polarisation special epaper link

dont want tell fellow comm student dont want think im kind suckup unhappy
similar to
u