# Exploring how artificial intelligence technologies could be leveraged to combat fake news.
# http://www.fakenewschallenge.org/

In [1]:
import pandas as pd
import numpy as np
import string
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from nltk.stem import PorterStemmer # stemmer
from nltk.stem.snowball import SnowballStemmer # inny steemer przekształca runnin na run 
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from sklearn.pipeline import FeatureUnion
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion
from imblearn.over_sampling import SMOTE
from scipy.spatial.distance import cosine


In [2]:
bodies = pd.read_csv(r"C:\Users\Tomasz\PycharmProjects\train_bodies.csv")
print(bodies.shape)
bodies.head(2)

(1683, 2)


Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...


In [3]:
headlines = pd.read_csv(r'C:\Users\Tomasz\PycharmProjects\train_stances.csv')
print(headlines.shape)
headlines.head(2)

(49972, 3)


Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree


## first attempt
## join sets

In [4]:
df = pd.merge(headlines, bodies, left_on='Body ID', right_on='Body ID')

In [5]:
df.head(2)

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\n\n...
1,Seth Rogen to Play Apple’s Steve Wozniak,712,discuss,Danny Boyle is directing the untitled film\n\n...


In [6]:
df.shape

(49972, 4)

In [7]:
df['text'] = df['Headline'] + ' ' + df['articleBody']
df.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody,text
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\n\n...,Police find mass graves with at least '15 bodi...
1,Seth Rogen to Play Apple’s Steve Wozniak,712,discuss,Danny Boyle is directing the untitled film\n\n...,Seth Rogen to Play Apple’s Steve Wozniak Danny...
2,Mexico police find mass grave near site 43 stu...,712,unrelated,Danny Boyle is directing the untitled film\n\n...,Mexico police find mass grave near site 43 stu...
3,Mexico Says Missing Students Not Found In Firs...,712,unrelated,Danny Boyle is directing the untitled film\n\n...,Mexico Says Missing Students Not Found In Firs...
4,New iOS 8 bug can delete all of your iCloud do...,712,unrelated,Danny Boyle is directing the untitled film\n\n...,New iOS 8 bug can delete all of your iCloud do...


In [8]:
stance={'unrelated':1,'discuss':2,'agree':3,'disagree':4}
df.Stance = [stance[item] for item in df.Stance]
#df.Stance.value_counts()

In [9]:
y = df.Stance
X = df.text

In [10]:
y.value_counts() # non balanced set

1    36545
2     8909
3     3678
4      840
Name: Stance, dtype: int64

## division ,train, test

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state = 42
)

In [12]:
# only take the test to check !!!
# how looks the set ?

In [13]:
X_train.shape # (39977,)
y_train.shape # (39977,)

#X_train.head()

(39977,)

In [14]:
columnname = {'text': X_train, 'Stance': y_train}
dfpolaczony = pd.DataFrame(data=columnname)



In [15]:
dfpolaczony.head()

Unnamed: 0,text,Stance
45131,Christian Bale In Talks To Play Steve Jobs In ...,2
48011,Her Majesty’s magic mushrooms Investment firm ...,1
26530,Dog found abandoned outside railway station wi...,3
30200,Angry mob hacks off alleged rapist's genitals ...,1
7195,Nigeria Boko Haram blamed for raids despite tr...,1


# Undersamling , Oversampling

In [16]:
df_balanced_undersampling = pd.concat(
    [
        dfpolaczony[dfpolaczony['Stance'] == 1].sample(n=680, random_state=42, replace=True),
        dfpolaczony[dfpolaczony['Stance'] == 2].sample(n=680, random_state=42, replace=True),
        dfpolaczony[dfpolaczony['Stance'] == 3].sample(n=680, random_state=42, replace=True),
        dfpolaczony[dfpolaczony['Stance'] == 4]       
    ]
)

In [17]:
df_balanced_oversampling = pd.concat(
    [
        dfpolaczony[dfpolaczony['Stance'] == 1],
        dfpolaczony[dfpolaczony['Stance'] == 2].sample(n=29305, random_state=42, replace=True),
        dfpolaczony[dfpolaczony['Stance'] == 3].sample(n=29305, random_state=42, replace=True),
        dfpolaczony[dfpolaczony['Stance'] == 4].sample(n=29305, random_state=42, replace=True)       
    ]
)

In [18]:
df_balanced_oversampling.Stance.value_counts() # 29305 * 4 ok
df_balanced_oversampling.head(2)

Unnamed: 0,text,Stance
48011,Her Majesty’s magic mushrooms Investment firm ...,1
30200,Angry mob hacks off alleged rapist's genitals ...,1


In [19]:
df_balanced_undersampling.Stance.value_counts() # 680 * 4 ok

3    680
1    680
4    680
2    680
Name: Stance, dtype: int64

In [20]:
X_train_under = df_balanced_undersampling.text
y_train_under = df_balanced_undersampling.Stance
X_train_under.shape
#print(680 * 4) # 2720 - ok

(2720,)

In [21]:
X_train_over = df_balanced_oversampling.text
y_train_over = df_balanced_oversampling.Stance
#print(29305 * 4) # 117220 - ok
# y_train_ove.shape # 117220 - ok

In [22]:
X_train_over.index_old = X_train_over.index # old index
y_train_over.index_old = y_train_over.index # old index
X_train_over.index = [i for i in range(0,len(X_train_over))] # new index
y_train_over.index = [i for i in range(0,len(X_train_over))] # new index
#X_train_over.tail()

# Logistic regression oversampling , undersampling

In [23]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression())
])



In [24]:
pipe.fit(X_train_over, y_train_over)

#These were the best combination of tuning parameters discovered
##best_params = {'tfidf__max_features': None, 'tfidf__use_idf': False,
##               'tfidf__smooth_idf': False, 'tfidf__ngram_range': (1, 2),
##               'tfidf__max_df': 1.0, 'tfidf__stop_words':



Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [25]:
accuracy_score( pipe.predict(X_test), y_test) # score train_over 0.6477238619309654

0.6477238619309654

In [26]:
pipe.fit(X_train_under, y_train_under)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [27]:
accuracy_score(pipe.predict(X_test), y_test) # score train_under 0.463631815907954

0.463631815907954

In [28]:
pipe4 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', LogisticRegression())
                    ])

paramrf4 = {
              'tfidf__ngram_range': [(1, 2)],
              #'tfidf__use_idf': (True, False),
              #'tfidf__max_df': [0.25, 0.5, 0.75, 1.0],
              #'tfidf__max_features': [10, 50, 100, 250, 500, 1000, None],
              #'tfidf__smooth_idf': (True, False),
              'tfidf__norm': ('l2',None),
              'rf__penalty':('l2','l1'),
              'rf__C':[0.1, 1.0, 10]
              }

In [29]:
grid4 = GridSearchCV(pipe4, paramrf4,cv=5)
grid4.fit(X_train_under, y_train_under)
accuracy_score(grid4.predict(X_test), y_test) # 0.5551775887943972 is better result than 0.463631815907954



0.5551775887943972

In [30]:
grid4.best_params_   # {'rf__C': 1.0, 'rf__penalty': 'l2','tfidf__ngram_range': (1, 2),'tfidf__norm': None}

{'rf__C': 1.0,
 'rf__penalty': 'l2',
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': None}

# Random Forest Undersampling Oversampling

In [31]:
from sklearn.ensemble import RandomForestClassifier

piperf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
                    ])

paramrf = {
              #'tfidf__ngram_range': [(1, 1), (1, 2)],
              #'tfidf__use_idf': (True, False),
              #'tfidf__max_df': [0.25, 1.0],
              #'tfidf__max_features': [10, 100, 1000, None],
              #'tfidf__smooth_idf': (True, False),
              #'rf__n_estimators': [200, 500],
              #'rf__max_features': ['sqrt', 'log2'],
              #'rf__max_depth' : [4,6,8],
              #'rf__criterion' :['gini', 'entropy']
              }

In [32]:
gridrf = GridSearchCV(piperf, paramrf,cv=5)
gridrf.fit(X_train_under, y_train_under)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [33]:
accuracy_score(gridrf.predict(X_test), y_test) # 0.6050025012506253

0.6050025012506253

In [34]:
gridrf.best_estimator_

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [35]:
confusion_matrix ( y_test , gridrf.predict(X_test))

array([[4437, 1292, 1098,  413],
       [ 395, 1062,  226,  126],
       [ 129,   77,  437,  143],
       [   5,   11,   33,  111]], dtype=int64)

In [36]:
piperf1 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
                    ])


In [37]:
piperf1.fit(X_train_over, y_train_over)



Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [38]:
accuracy_score(piperf1.predict(X_test), y_test) # 0.8952476238119059 

0.8997498749374687

In [39]:
confusion_matrix ( y_test , piperf1.predict(X_test))

array([[6957,  203,   67,   13],
       [ 175, 1470,  134,   30],
       [  84,  143,  508,   51],
       [  15,   28,   59,   58]], dtype=int64)

In [40]:
piperf6 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('TrnSVD' , TruncatedSVD()),
    ('rf', RandomForestClassifier())
                    ])

paramrf6 = {
              'tfidf__ngram_range': [(1, 1), (1, 2)],
              #'tfidf__use_idf': (True, False),
              #'tfidf__max_df': [0.25, 0.5, 0.75, 1.0],
              'tfidf__max_features': [10],
              #'tfidf__smooth_idf': (True, False),
              'TrnSVD__n_components':[8],
              #'tfidf__norm': ('l1', 'l2', None),
              #'rf__n_estimators': [200, 500],
              #'rf__max_features': ['sqrt', 'log2'],
              #'rf__max_depth' : [4,6],
              #'rf__criterion' :['gini', 'entropy']
              }

In [41]:
gridrf6 = GridSearchCV(piperf6, paramrf6,cv=5) # cv = 5
gridrf6.fit(X_train_over, y_train_over)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'tfidf__ngram_range': [(1, 1), (1, 2)], 'tfidf__max_features': [10], 'TrnSVD__n_components': [8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [42]:
accuracy_score(gridrf6.predict(X_test), y_test) # 0.7157578789394697

0.7165582791395698

In [43]:
gridrf6.best_params_

{'TrnSVD__n_components': 8,
 'tfidf__max_features': 10,
 'tfidf__ngram_range': (1, 2)}

In [44]:
confusion_matrix ( y_test , gridrf6.predict(X_test))

array([[5594,  906,  534,  206],
       [ 476, 1135,  142,   56],
       [ 257,   94,  373,   62],
       [  49,   20,   31,   60]], dtype=int64)

In [45]:
import seaborn as sns 
import matplotlib.pyplot as plt 
cm =confusion_matrix( y_test , gridrf6.predict(X_test) ) 
index = ['1','2','3','4'] 
columns = ['1','2','3','4'] 
cm_df = pd.DataFrame(cm,columns,index) 
plt.figure(figsize=(10,6)) 
sns.heatmap(cm_df, annot=True)

<matplotlib.axes._subplots.AxesSubplot at 0x21a80faeb00>

## second attempt
 do not combine two collections
 separately columns Headline , articleBody

In [46]:
df.head(2)

Unnamed: 0,Headline,Body ID,Stance,articleBody,text
0,Police find mass graves with at least '15 bodi...,712,1,Danny Boyle is directing the untitled film\n\n...,Police find mass graves with at least '15 bodi...
1,Seth Rogen to Play Apple’s Steve Wozniak,712,2,Danny Boyle is directing the untitled film\n\n...,Seth Rogen to Play Apple’s Steve Wozniak Danny...


In [47]:
X=df[['Headline','articleBody']]
y=df['Stance']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state = 42
)

In [48]:
X_train.shape

(39977, 2)

In [49]:
y_train.shape

(39977,)

In [50]:
# generate data
trainzloczenie = X_train
trainzloczenie['Stance']=y_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [51]:
trainzloczenie.shape


(39977, 3)

In [52]:
trainzloczenie.Stance.value_counts()

1    29305
2     7100
3     2892
4      680
Name: Stance, dtype: int64

# Undersamling , Oversampling

In [53]:
df_balanced_oversampling = pd.concat(
    [
        trainzloczenie[trainzloczenie['Stance'] == 1],
        trainzloczenie[trainzloczenie['Stance'] == 2].sample(n=29305, random_state=42, replace=True),
        trainzloczenie[trainzloczenie['Stance'] == 3].sample(n=29305, random_state=42, replace=True),
        trainzloczenie[trainzloczenie['Stance'] == 4].sample(n=29305, random_state=42, replace=True)       
    ]
)

In [54]:
df_balanced_undersampling = pd.concat(
    [
        trainzloczenie[trainzloczenie['Stance'] == 1].sample(n=680, random_state=42, replace=True),
        trainzloczenie[trainzloczenie['Stance'] == 2].sample(n=680, random_state=42, replace=True),
        trainzloczenie[trainzloczenie['Stance'] == 3].sample(n=680, random_state=42, replace=True),
        trainzloczenie[trainzloczenie['Stance'] == 4]          
    ]
)

In [55]:
df_balanced_oversampling.Stance.value_counts()

4    29305
3    29305
2    29305
1    29305
Name: Stance, dtype: int64

In [56]:
df_balanced_undersampling.Stance.value_counts()

3    680
1    680
4    680
2    680
Name: Stance, dtype: int64

In [57]:
df_balanced_undersampling.head(2)

Unnamed: 0,Headline,articleBody,Stance
36307,Kim Jong ill? Analysts say 27-year-old sister ...,(DETROIT) In a decision that’s expected to sen...,1
6486,Reports: Boko Haram may release schoolgirls as...,Young North Korean dictator Kim Jong Un’s heal...,1


In [58]:
X_train_under = df_balanced_undersampling[['Headline','articleBody']]
y_train_under = df_balanced_undersampling.Stance

In [59]:
X_train_over = df_balanced_oversampling[['Headline','articleBody']]
y_train_over = df_balanced_oversampling.Stance

In [60]:
X_train_over.index_old = X_train_over.index # old index
y_train_over.index_old = y_train_over.index # old index
X_train_over.index = [i for i in range(0,len(X_train_over))] # new index
y_train_over.index = [i for i in range(0,len(X_train_over))] # new index

  """Entry point for launching an IPython kernel.


In [61]:
X_train_over.head(2)

Unnamed: 0,Headline,articleBody
0,Her Majesty’s magic mushrooms,Investment firm Piper Jaffray issued a report ...
1,Angry mob hacks off alleged rapist's genitals ...,British man suspected of appearing in videos o...


In [65]:
vect = TfidfVectorizer(max_features = 50, stop_words="english",lowercase=False)
X_train_under_Head = vect.fit_transform(X_train_under.Headline).toarray()
X_train_under_article = vect.fit_transform(X_train_under.articleBody).toarray()

In [67]:
X_train_under_Head.shape,X_train_under_article.shape # ok

((2720, 50), (2720, 50))

In [68]:
# create function 
from numpy import dot
from numpy.linalg import norm

def cos_sim(t1,t2):
    return dot(t1, t2)/(norm(t1)*norm(t2))


In [69]:
cos=[]
for el in range(0,len(X_train_under_Head)):
    cos.insert(el,(cos_sim(X_train_under_Head[el] , X_train_under_article[el])))

  


In [70]:
len(cos) #2720 

2720

In [71]:
Head = pd.DataFrame(X_train_under_Head)
article = pd.DataFrame(X_train_under_article)
cos = pd.DataFrame(cos)

# replace np.nan to 0
cos=cos.fillna(0)

# zip DataFram
Head_cos_suma=np.concatenate((Head, cos,article),axis=1)
Head_cos_suma.shape #(2720, 17) => 8+1+8 ok

X_train_under =Head_cos_suma


## transform the set X_test to count the confusion_matrix

In [72]:
X_test.head(2)

Unnamed: 0,Headline,articleBody
17851,"Apple Watch to Be Shower-Proof, Have 100,000 A...","PALM BEACH GARDENS, Fla. -- A journeyman profe..."
6400,Weatherman caught peeing live on camera,Young North Korean dictator Kim Jong Un’s heal...


In [73]:
Headline_test = X_test.Headline
articleBody_test = X_test.articleBody

In [75]:
Headline_test.index_old = Headline_test.index # stary index
articleBody_test.index_old = articleBody_test.index # stary index
Headline_test.index = [i for i in range(0,len(Headline_test))] # nowy index
articleBody_test.index = [i for i in range(0,len(articleBody_test))] # nowy index

In [76]:
cos_test =[]

#vect = TfidfVectorizer(max_features = 8, stop_words="english",lowercase=False)
Headline_test = vect.fit_transform(Headline_test).toarray()
article_test = vect.fit_transform(articleBody_test).toarray()

for el in range(0,len(Headline_test)):
    cos_test.insert(el,(cos_sim(Headline_test[el] , article_test[el])))

  


In [77]:
#len(cos_test) #9995

In [78]:
Head_test = pd.DataFrame(Headline_test) # (9995, 8)
article_test = pd.DataFrame(article_test) # (9995, 8)
cos_test = pd.DataFrame(cos_test) # (9995, 1)

cos_test=cos_test.replace(np.nan, 0) #nan -> 0
#cos_test.shape # (9995, 1)

In [79]:
# combine DataFrams
Head_cos_article_test=np.concatenate((Head_test, cos_test,article_test),axis=1)
#Head_cos_article_test.shape #(9995, 17) => 8+1+8 ok
#X_test_Head_cos_article =Head_cos_article_test

In [80]:
X_test=Head_cos_article_test

In [81]:
piperfunder = Pipeline([
    ('rf', RandomForestClassifier())
                    ])

paramrfunder = { 
              'rf__n_estimators': [200, 500],
              'rf__max_features': ['sqrt', 'log2'],
              'rf__max_depth' : [5,10,50],
              'rf__criterion' :['gini', 'entropy']
              }

In [82]:
gridrfunder = GridSearchCV(piperfunder, paramrfunder,cv=5) # cv =5
gridrfunder.fit(X_train_under, y_train_under)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'rf__n_estimators': [200, 500], 'rf__max_features': ['sqrt', 'log2'], 'rf__max_depth': [5, 10, 50], 'rf__criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [83]:
accuracy_score(gridrfunder.predict(X_test), y_test)

0.37778889444722363

In [84]:
confusion_matrix ( y_test , gridrfunder.predict(X_test))

array([[2899, 2345, 1573,  423],
       [ 854,  621,  246,   88],
       [ 236,  252,  245,   53],
       [  39,   57,   53,   11]], dtype=int64)

In [None]:
# SUMMARY
# FIRST ATTEMPT
# MODELS                      SET             VALUES
# Logistic regression       train under    0.463631815907954
# Logistic regression       train over     0.5551775887943972
# Random Forest             train under    0.6050025012506253
# Random Forest             train over     0.8997498749374687 # best model

# SECOND ATTEMPT
# MODELS                      SET             VALUES
# Random Forest             train under    0.37778889444722363

In [85]:
# best matrix
# array([[6928,  220,   77,   15],
#        [ 183, 1463,   33,  130],
#        [  91,  149,  499,   47],
#        [  19,   24,   59,   58]], dtype=int64)

In [86]:
# Best model
# {'TrnSVD__n_components': 8,
#  'tfidf__max_features': 10,
#  'tfidf__ngram_range': (1, 1)}

In [87]:
# gridrf.best_estimator_

In [88]:
# Pipeline(memory=None,
#      steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
#         dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
#         lowercase=True, max_df=1.0, max_features=None, min_df=1,
#         ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...obs=None,
#             oob_score=False, random_state=None, verbose=0,
#             warm_start=False))])