In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
! pip install wordcloud
! pip install nltk
from fastai.text.all import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from wordcloud import WordCloud
import re
import nltk
import string
from nltk import WordNetLemmatizer
from nltk.stem import PorterStemmer 
from nltk.corpus import stopwords

In [None]:
# Load the competition's train and test CSVs from the Kaggle input directory.
# Only df_train is used in the cells below; df_test is loaded but never referenced
# in the visible part of the notebook.
df_train = pd.read_csv('../input/nlp-getting-started/train.csv')
df_test = pd.read_csv("../input/nlp-getting-started/test.csv")

In [None]:
# Fetch the NLTK corpora used by the stop-word filter and the lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
# Keep the stop words in a set: text_preproc() tests every token of every tweet
# for membership, which is a linear scan on a list but a hash lookup on a set.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


# citation for text_preproc() https://towardsdatascience.com/cleaning-text-data-with-python-b69b47b97b76
# I just added lemmatization
def text_preproc(x):
    """Clean one raw tweet for the bag-of-words models.

    Steps, in order: lowercase; drop English stop words and lemmatise the
    remaining whitespace-separated tokens; discard non-ASCII characters;
    remove URLs, "@" and "#" markers, the noise words "amp" and "new", lone
    "s" tokens, apostrophe suffixes, punctuation and any token containing a
    digit; finally collapse runs of whitespace into a single space.

    Parameters
    ----------
    x : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. It is not stripped, so it may still begin or end
        with a single space.
    """
    x = x.lower()
    # split() with no argument breaks on any whitespace, so tokens separated by
    # newlines or tabs are also checked against the stop-word list and lemmatised
    # individually (split(' ') treated "fire\nthe" as one token).
    x = ' '.join(lemmatizer.lemmatize(word) for word in x.split() if word not in stop_words)
    x = x.encode('ascii', 'ignore').decode()  # drop every non-ASCII character
    x = re.sub(r'https*\S+', ' ', x)  # URLs
    x = re.sub(r'@', ' ', x)
    # "amp" (presumably the remains of the HTML entity "&amp;") and "new" are noise
    # discovered from the word clouds. \b limits the match to whole words, so
    # "camp" or "news" are no longer mangled into "c" / "s".
    x = re.sub(r'\bamp\b', '', x)
    x = re.sub(r'\bnew\b', '', x)
    # Lone "s" token: replace it with a space (not the empty string) so that the
    # words on either side are not glued together.
    x = re.sub(r' s ', ' ', x)
    x = re.sub(r'#', ' ', x)
    x = re.sub(r'\'\w+', '', x)  # apostrophe suffixes such as 's or 't
    x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
    x = re.sub(r'\w*\d+\w*', '', x)  # any token that contains a digit
    x = re.sub(r'\s{2,}', ' ', x)
    return x

In [None]:
# Clean every tweet in place; the function is passed directly, no wrapper lambda needed.
# Note: this overwrites the raw text, so re-running the cell cleans already-cleaned text.
df_train['text'] = df_train['text'].apply(text_preproc)

Now that we've cleaned the text a little, let's take a look at the word distributions with word clouds.

In [None]:
# Concatenate all cleaned tweets of each class into one long string for WordCloud.
# str.join builds each string in a single pass instead of growing it row by row
# inside an iterrows() loop; every tweet is still followed by one space, as before.
is_target_1 = df_train['target'] == 1
is_target_0 = df_train['target'] == 0
dis = ''.join(text + ' ' for text in df_train.loc[is_target_1, 'text'])
fake = ''.join(text + ' ' for text in df_train.loc[is_target_0, 'text'])

In [None]:
# Word cloud of the tweets labelled target == 1.
wordcloud_real = WordCloud().generate(dis)
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud_real, interpolation='bilinear')
ax.set_title('Most frequent words in target = 1 tweets')
ax.axis('off')  # pixel coordinates carry no meaning for a word cloud
plt.show()

In [None]:
# Word cloud of the tweets labelled target == 0.
wordcloud_fake = WordCloud().generate(fake)
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud_fake, interpolation='bilinear')
ax.set_title('Most frequent words in target = 0 tweets')
ax.axis('off')  # pixel coordinates carry no meaning for a word cloud
plt.show()

Now let's build a model.

In [None]:
# TF-IDF vectoriser with scikit-learn's default settings. It is fitted further down,
# on the training split only, and then reused to transform the held-out split.
vectorizer = TfidfVectorizer()

First we have to vectorize our words. I'll be using TF-IDF (https://en.wikipedia.org/wiki/Tf%E2%80%93idf).

In [None]:
# Hold out 20% of the labelled tweets for evaluation. A fixed random_state keeps the
# split, and therefore the reported scores, identical on every run.
training_set = df_train.sample(frac=0.8, random_state=42)
test_set = df_train.drop(training_set.index)
# Fit the vocabulary and IDF weights on the training split only, then reuse them for
# the held-out split so nothing about it leaks into the features.
X = vectorizer.fit_transform(training_set["text"])
test_X = vectorizer.transform(test_set["text"])

Now let's build an ensemble model.

In [None]:
from sklearn.metrics import classification_report
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC

# Three base classifiers. Each one is seeded so that repeated runs of the notebook
# give the same ensemble (previously only the random forest was seeded).
svc = LinearSVC(random_state=0)
mlp = MLPClassifier(max_iter=3000, random_state=0)
rf_clf = RandomForestClassifier(n_estimators=200, random_state=0, bootstrap=True)

# Hard voting: the predicted class is the majority vote of the three models.
v_clf = VotingClassifier(estimators=[('rf', rf_clf), ('svc', svc), ('mlp', mlp)], voting="hard")

v_clf.fit(X, training_set['target'])

# Predictions for the held-out 20% split; scored in the next cell.
prediction = v_clf.predict(test_X)

### Sklearn ensemble Results:

In [None]:
# Per-class precision / recall / F1 of the voting ensemble on the held-out split.
# classification_report returns a plain string, hence the print().
print(classification_report(test_set["target"], prediction))

Roughly 79% accuracy with this model. Now for a fastai pretrained model with ULMFiT.

In [None]:
# Re-read the raw training CSV: df_train['text'] was overwritten with cleaned text
# above, and the fastai models are trained on the original tweets instead.
df_lm = pd.read_csv('../input/nlp-getting-started/train.csv') # not using preprocessing applied in sklearn ensemble.

In [None]:
def pre_simp(x):
    """Remove the stand-alone noise token "amp" from a tweet.

    The match is anchored on word boundaries, so only the whole word "amp"
    (e.g. the middle of the HTML entity "&amp;") is removed; words that merely
    contain those letters, such as "camp" or "example", are left untouched.

    Parameters
    ----------
    x : str
        Tweet text.

    Returns
    -------
    str
        The text with every whole-word "amp" deleted. Surrounding characters,
        including the "&" and ";" of an entity, are kept as they are.
    """
    return re.sub(r'\bamp\b', '', x)

Let's train two classifiers, each initialised from a language model fine-tuned on the tweets. First, let's make a classifier pretrained on a backwards language model of the tweets.

In [None]:
# Language-model DataLoaders over the raw tweets, read back to front (backwards=True).
# 10% of the rows are held out for validation; the seed makes that random split
# reproducible across runs.
dls = TextDataLoaders.from_df(df_lm, text_col='text', is_lm=True, valid_pct=.1, seed=42, backwards=True)
dls.show_batch(max_n=3)

In [None]:
# Language-model learner on the backwards tweet batches, starting from pretrained
# AWD_LSTM weights; accuracy and perplexity are reported after each epoch.
learn = language_model_learner(dls, AWD_LSTM, pretrained=True, drop_mult=0.5, metrics=[accuracy, perplexity])

In [None]:
# Run fastai's learning-rate finder to help choose the rate used in the next cell.
learn.lr_find()

In [None]:
# Three one-cycle epochs with a peak learning rate of 1e-1, before the unfreeze() below.
learn.fit_one_cycle(3, 1e-1)

In [None]:
learn.unfreeze() # Run for many epochs with all layers unfrozen
# Eight epochs; the slice gives the lowest and highest learning rates, which fastai
# spreads across the layer groups.
learn.fit_one_cycle(8, slice(1e-5,2e-2))

In [None]:
# Save the fine-tuned encoder under the name 'finetuned' for the backward classifier
# below. NOTE: the forward language model later saves its encoder under the same name.
learn.save_encoder('finetuned')

Now it can generate backwards tweets

In [None]:
# Sample a few continuations of a prompt from the fine-tuned (backwards) language model.
TEXT = "A horrible"
N_WORDS = 20
N_SENTENCES = 2

preds = []
for _ in range(N_SENTENCES):
    # temperature < 1 makes the sampling less random than the raw distribution
    preds.append(learn.predict(TEXT, N_WORDS, temperature=0.75))

print("\n".join(preds))

Now let's use this pretrained model for a classifier.

In [None]:
# Classification DataLoaders over the raw tweets, read back to front like the language model.
train_X = pd.read_csv('../input/nlp-getting-started/train.csv')
# The classifier must numericalise text with the language model's vocabulary, otherwise
# the saved encoder's embedding rows do not line up with the tokens. In from_df that
# argument is called text_vocab (a plain `vocab=` keyword is not one of its parameters).
# seed fixes the random train/validation split.
dls_clas = TextDataLoaders.from_df(train_X, text_col='text', label_col='target', backwards=True,
                                   text_vocab=dls.vocab, seed=42)
dls_clas.show_batch(max_n=3)

In [None]:
# Backward text classifier (AWD_LSTM) in mixed precision. This rebinds `learn`, which
# until now referred to the backward language model.
learn = text_classifier_learner(dls_clas, AWD_LSTM, drop_mult=0.5, metrics=accuracy).to_fp16()

In [None]:
# Load the encoder saved from the backward language model into the classifier.
learn = learn.load_encoder('finetuned')

# Learning-rate finder for the classifier, run before the training cell that follows.
learn.lr_find()

In [None]:
# Three one-cycle epochs at a peak learning rate of 1e-2, before the unfreeze() below.
learn.fit_one_cycle(3,1e-2)

In [None]:
# Make every layer group trainable, then re-run the learning-rate finder.
learn.unfreeze()
learn.lr_find()

In [None]:
# Three more epochs on the unfrozen model; the slice gives the lowest and highest
# learning rates, which fastai spreads across the layer groups.
learn.fit_one_cycle(3, slice(2e-5,3e-3))

Now let's do the same thing with a forward (non-backwards) language model.

In [None]:
# Fresh copy of the raw training tweets for the forward language model.
df_lm = pd.read_csv('../input/nlp-getting-started/train.csv')

In [None]:
# Optional clean-up step, currently disabled (the helper is named pre_simp):
#df_lm['text'] = df_lm['text'].apply(lambda x: pre_simp(x))

# Forward language-model DataLoaders; 10% held out, seeded so the split is reproducible.
dls_f = TextDataLoaders.from_df(df_lm, text_col='text', is_lm=True, backwards=False, valid_pct=.1, seed=42)
# Show a batch from the forward loaders just built (not from the backwards `dls`).
dls_f.show_batch(max_n=3)

In [None]:
# Forward language-model learner from pretrained AWD_LSTM weights, in mixed precision;
# accuracy and perplexity are reported after each epoch.
learn_f = language_model_learner(dls_f, AWD_LSTM, pretrained=True, drop_mult=0.5, metrics=[accuracy, perplexity]).to_fp16()

In [None]:
# Learning-rate finder for the forward language model.
# A copy-pasted `learn = learn.load_encoder('finetuned')` used to precede this call. It
# reloaded the language-model encoder into the already trained backward classifier
# `learn`, discarding the encoder fine-tuning that ens_predict() relies on, so it has
# been removed. The forward language model needs no encoder loaded at this point.
learn_f.lr_find()

In [None]:
# Three one-cycle epochs at a peak learning rate of 2e-1, before the unfreeze() below.
learn_f.fit_one_cycle(3, 2e-1)

In [None]:
# Make every layer group trainable, then re-run the learning-rate finder.
learn_f.unfreeze()
learn_f.lr_find()

In [None]:
# Eight epochs on the unfrozen model; the slice gives the lowest and highest learning
# rates, which fastai spreads across the layer groups.
learn_f.fit_one_cycle(8, slice(1e-6,3e-3))

In [None]:
# Save the forward encoder. NOTE: this reuses the name 'finetuned' and therefore
# overwrites the backward encoder file saved earlier; from here on 'finetuned'
# refers to the forward encoder.
learn_f.save_encoder('finetuned')

In [None]:
# Classification DataLoaders over the raw tweets for the forward model.
train_X = pd.read_csv('../input/nlp-getting-started/train.csv')
# Optional cleaning step, currently disabled (the helper is named text_preproc):
#train_X['text'] = train_X['text'].apply(lambda x: text_preproc(x))
# The classifier must numericalise text with the forward language model's vocabulary so
# the saved encoder's embedding rows line up with the tokens. In from_df that argument
# is called text_vocab (a plain `vocab=` keyword is not one of its parameters).
# seed fixes the random train/validation split.
dls_clas_f = TextDataLoaders.from_df(train_X, text_col='text', label_col='target', backwards=False,
                                     text_vocab=dls_f.vocab, seed=42)
dls_clas_f.show_batch(max_n=3)

In [None]:
# Forward text classifier in mixed precision. This rebinds `learn_f`, which until now
# referred to the forward language model.
learn_f = text_classifier_learner(dls_clas_f, AWD_LSTM, drop_mult=0.5, metrics=accuracy).to_fp16()
# 'finetuned' is the file most recently written by the forward language model's save_encoder call.
learn_f = learn_f.load_encoder('finetuned')
learn_f.lr_find()

In [None]:
# Three one-cycle epochs at a peak learning rate of 2e-2, before the unfreeze() below.
learn_f.fit_one_cycle(3,2e-2)

In [None]:
# Make every layer group trainable, then re-run the learning-rate finder.
learn_f.unfreeze()
learn_f.lr_find()

In [None]:
# Six more epochs on the unfrozen model; the slice gives the lowest and highest
# learning rates, which fastai spreads across the layer groups.
learn_f.fit_one_cycle(6, slice(1e-5,2e-3))

The function below is used to make ensemble predictions.

In [None]:


#ensemble the forward and backward predictions
def ens_predict(obj):
    """Classify one text by summing the forward and backward models' outputs.

    Takes element [2] of each learner's ``predict`` result (the per-class
    probabilities in fastai), adds the two vectors and compares the totals for
    class 0 and class 1.

    Parameters
    ----------
    obj : str
        Text passed unchanged to ``learn_f.predict`` and ``learn.predict``.

    Returns
    -------
    int
        0 if the summed score for class 0 is strictly larger, otherwise 1
        (ties therefore resolve to 1).
    """
    forward_probs = learn_f.predict(obj)[2]
    backward_probs = learn.predict(obj)[2]
    combined = forward_probs + backward_probs
    return 0 if combined[0] > combined[1] else 1