# Check (That Tweet) Yo Self 
## Prioritizing Tweets to Fact Check
###### Part 5A: Linear Model (Supervised)

Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import time
import warnings
import regex as re
import seaborn as sns
import re
import statistics

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.linear_model import Ridge 
from sklearn.linear_model import Lasso 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from nltk.sentiment.vader import SentimentIntensityAnalyzer
warnings.filterwarnings('ignore')
np.random.seed(824)
from bs4 import BeautifulSoup 

# Import stopwords.
from nltk.corpus import stopwords # Import the stopword list
import nltk

from tweetscrape.users_scrape import TweetScrapperUser

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

plt.style.use('fivethirtyeight')

Loading our data that now also contains user info:

In [2]:
# Read the modelling dataset (tweets plus user info) from the project's data folder.
tweet = pd.read_csv(filepath_or_buffer='../data/model_attempt_tweets.csv')

In [3]:
# Bag-of-words vectorizer over unigrams, bigrams and trigrams.
# English stop words are removed, and any term found in more than 25% of the
# documents (max_df=0.25) is dropped from the vocabulary.
cvec = CountVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    max_df=0.25,
    min_df=1,
)




In [4]:
# Learn the n-gram vocabulary from the tweet text and count occurrences per tweet.
tweet_text = tweet['text']
cvec_matrix = cvec.fit_transform(tweet_text)

In [5]:
# Wrap the document-term count matrix in a DataFrame whose columns are the n-grams.
#
# The matrix is kept sparse on purpose: calling `.toarray()` would allocate one
# integer cell per (tweet, n-gram) pair, which for a vocabulary of several hundred
# thousand n-grams is far more memory than a dense array can reasonably hold.
#
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`, so use whichever accessor this installation provides.
if hasattr(cvec, 'get_feature_names_out'):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()

cvec_df = pd.DataFrame.sparse.from_spmatrix(cvec_matrix, columns=feature_names)
cvec_df

Unnamed: 0,00,00 000,00 000 001,00 april,00 april 25,00 en,00 en bb138m2i,00 et,00 et hear,00 et register,...,𝘚𝘩𝘢𝘳𝘪𝘯𝘨 𝘈𝘪𝘳 new,𝘪𝘯𝘴𝘪𝘥𝘦,𝘪𝘯𝘴𝘪𝘥𝘦 body,𝘪𝘯𝘴𝘪𝘥𝘦 body looked,𝘰𝘶𝘵𝘴𝘪𝘥𝘦,𝘰𝘶𝘵𝘴𝘪𝘥𝘦 body,𝘰𝘶𝘵𝘴𝘪𝘥𝘦 body minute,𝘵𝘩𝘢𝘵,𝘵𝘩𝘢𝘵 kill,𝘵𝘩𝘢𝘵 kill virus
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30114,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30115,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30116,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30117,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Show the vocabulary terms that became the column labels of the count table.
cvec_df.columns

Index(['00', '00 000', '00 000 001', '00 april', '00 april 25', '00 en',
       '00 en bb138m2i', '00 et', '00 et hear', '00 et register',
       ...
       '𝘚𝘩𝘢𝘳𝘪𝘯𝘨 𝘈𝘪𝘳 new', '𝘪𝘯𝘴𝘪𝘥𝘦', '𝘪𝘯𝘴𝘪𝘥𝘦 body', '𝘪𝘯𝘴𝘪𝘥𝘦 body looked',
       '𝘰𝘶𝘵𝘴𝘪𝘥𝘦', '𝘰𝘶𝘵𝘴𝘪𝘥𝘦 body', '𝘰𝘶𝘵𝘴𝘪𝘥𝘦 body minute', '𝘵𝘩𝘢𝘵', '𝘵𝘩𝘢𝘵 kill',
       '𝘵𝘩𝘢𝘵 kill virus'],
      dtype='object', length=638196)

These were some of the top phrases from vectorizing:

In [7]:
# Hand-picked n-gram columns to carry forward as model features.
top_phrases = [
    'hi er doc',
    'er doc inject',
    'hi er',
    'doc inject consume',
    'er doc',
    'doc inject',
    'consume disinfectants attempt',
    'disinfectants attempt kill',
    'attempt kill covid19',
    'disinfectants attempt',
    'attempt kill',
    'doc',
    'kill covid19',
    'er',
    'attempt',
    'hi',
    'hard lies',
    'hard lies social',
    'social isolation work',
]

# Column subset of the full count table, in the order listed above.
phrases = cvec_df[top_phrases]

In [9]:
# Display the selected phrase-count columns.
phrases

Unnamed: 0,hi er doc,er doc inject,hi er,doc inject consume,er doc,doc inject,consume disinfectants attempt,disinfectants attempt kill,attempt kill covid19,disinfectants attempt,attempt kill,doc,kill covid19,er,attempt,hi,hard lies,hard lies social,social isolation work
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30114,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30115,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30116,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30117,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
# Build the feature matrix from an independent copy of `phrases`.
# A plain `X = phrases` would only alias the same object, so any column later
# added to X would also appear in `phrases` (a column subset of `cvec_df`) and
# pandas would flag the write as a chained assignment.
X = phrases.copy()

Engineering a new interaction term between "big feelings" and "follow ratio":

In [11]:
# Interaction feature: element-wise product of the two tweet-level columns.
interaction = tweet['big_feelings'] * tweet['follow_ratio']
X['big_feelings_follow_ratio'] = interaction

In [12]:
# Target vector for the regression, taken from the `target` column of the tweet data.
y = tweet['target']

Trying another linear regression:

In [13]:
# Hold out a test set. No split size is given, so scikit-learn's default
# proportions apply; the fixed random_state makes the shuffle reproducible.
split = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split

In [14]:
# Linear regression estimator with scikit-learn's default settings.
linear_reg = LinearRegression()

In [15]:
# Fit on the training split only; the fitted estimator is this cell's displayed value.
linear_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [19]:
# R2 on each split, rounded to four decimal places for display.
train_r2 = round(linear_reg.score(X_train, y_train), 4)
test_r2 = round(linear_reg.score(X_test, y_test), 4)

print(f'Linear Regression Training R2 Score: {train_r2}.')
print(f'Linear Regression Testing R2 Score: {test_r2}.')

Training R2 Score: 0.9133.
Testing R2 Score: -0.0003.


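Unfortunately, even with many new features, we are not having much success using regression to predict the target: the training R2 is high while the testing R2 is essentially zero, so the model is overfitting the training data rather than learning anything that generalizes. In the next notebook we'll try a few more regression methods.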