# Check (That Tweet) Yo Self 
## Prioritizing Tweets to Fact Check
###### Part 5B: Linear Model (Supervised)

Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import BaggingRegressor
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer

Load in the data

In [2]:
tweet = pd.read_csv('../data/final_random_tweet.csv')

In [3]:
tweet.head(2)

Unnamed: 0,id,time,author,author_id,associated_tweet,text,links,hashtags,mentions,reply_count,...,clean_text,clean_word_count,clean_char_count,user_bio,user_location,user_url,user_tweets,user_following,user_followers,user_favorites
0,1254190074595553281,2020-04-25 16:26:30,Iam_helenna,215204985,1254190074595553281,"Today, we have 1182 cases in Nigeria with 35 d...",[],[''],[''],37,...,today cases nigeria deaths discharged isolatio...,28,193,I’m so F**king Rare /////WifeBusiness Entrepre...,,,26706,8566,11629,28266.0
1,1253828209075990531,2020-04-24 16:28:34,KerryeHill,2807727004,1253697753479331840,There's no such thing as a medical disinfectan...,[],[''],[''],1,...,thing medical disinfectant use pulmonary syste...,19,152,"Native to Florida. Single, no kids. Love to s...",,,2626,211,51,5758.0


In [4]:
tweet = tweet[:33200]

In [5]:
tweet.shape

(30300, 31)

Adding some of the features we previously engineered back in:

In [6]:
tweet['len_user'] = [len(x) for x in tweet['author']]

In [7]:
def per_upper(string):
    count = 0
    for s in string:
         if s == s.upper():
                count += 1
    ratio = count / len(string)
    return ratio

In [8]:
tweet['big_feelings'] = tweet['text'].apply(per_upper)

In [9]:
def get_ratio(followers, following):
    if following == 0:
        following = 1
    elif followers == 0:
        return 0
    else:
        return followers / following

In [10]:
tweet['ratio'] = [get_ratio(m, n) for m, n in zip(tweet['user_followers'], tweet['user_following'])]

In [11]:
tweet['has_url'] = tweet['user_url'].notna().astype(int)

In [12]:
tweet['has_url'].value_counts()

0    21382
1     8918
Name: has_url, dtype: int64

In [13]:
tweet['has_location'] = tweet['user_location'].notna().astype(int)

In [14]:
tweet['has_bio'] = tweet['user_bio'].notna().astype(int)

In [15]:
tweet['len_bio'] = [len(str(x)) for x in tweet['user_bio']]

In [16]:
def is_numeric(string):
    count = 0
    for s in string:
        try:
            int(s)
            count += 1
        except:
            count = count
    return count / len(string)

In [17]:
tweet['ratio_num_user'] = tweet['author'].apply(is_numeric)

In [18]:
tweet['target'] = tweet['reply_count'] + tweet['favorite_count'] + tweet['retweet_count']

In [19]:
tweet['target']

0        289
1          3
2          1
3         55
4          0
        ... 
30295      0
30296      0
30297     16
30298      0
30299      0
Name: target, Length: 30300, dtype: int64

In [20]:
tweet.drop(columns = ['user_bio', 'user_location', 'user_url'], inplace = True)

In [21]:
tweet['user_favorites'] = tweet['user_favorites'].fillna(0)

In [22]:
tweet[tweet['ratio'].isna()][['user_followers', 'user_following']]

Unnamed: 0,user_followers,user_following
84,0,0
115,0,0
176,0,0
195,0,0
211,0,0
...,...,...
30001,0,0
30093,0,0
30168,0,0
30169,0,0


All of the rows where 'ratio' is zero, the user followers and following is also zero. It would make the most sense to impute 0 there.

In [23]:
tweet['ratio'] = tweet['ratio'].fillna(0)

In [24]:
tweet.dropna(inplace = True)

Engineering a few interaction terms:

In [25]:
tweet['sent*2'] = tweet['text_sentiment'] **2
tweet['followers*2'] = tweet['user_followers'] **2
tweet['ratio*2'] = tweet['ratio'] **2
tweet['feelings*bio'] = tweet['big_feelings'] * tweet['len_bio']

In [26]:
sent = SentimentIntensityAnalyzer()

In [27]:
def emotion_range(string):
    emo_r = np.abs(sent.polarity_scores(string)['neg']) + sent.polarity_scores(string)['pos']
    return emo_r / 10

In [28]:
tweet['emotional_range'] = tweet['text'].apply(emotion_range)

In [29]:
sum(tweet.isna().sum())

0

These are the numeric features we'll use:

In [30]:
num_features = ['not_english', 'user_tweets', 
                            'user_favorites', 'has_url', 
                            'sent*2', 'followers*2', 'ratio', 
                            'feelings*bio', 'emotional_range' ]


Vectorizing the text so we can use it to model:

In [31]:
tf = TfidfVectorizer(stop_words=None,
                       max_features = 150,
                       min_df= 1,
                       ngram_range=(1,2)) 

tf.fit(tweet['text'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=150,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [32]:
text_features = tf.transform(tweet['text'])

In [33]:
text_df = pd.DataFrame(text_features.toarray(),
                          columns=tf.get_feature_names())

In [34]:
text_df = text_df.reset_index()

In [35]:
num_df = tweet[num_features].reset_index()

In [36]:
print(text_df.shape)
print(num_df.shape)

(30299, 151)
(30299, 10)


In [37]:
ss = StandardScaler()
ss.fit(num_df)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [38]:
X = pd.concat([text_df, num_df], axis = 1)
y = tweet['retweet_count'] + tweet['favorite_count']

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 21)

In [40]:
et = ExtraTreesRegressor(random_state = 42)
params = {
    'n_estimators': [50, 100],
    'max_features': [None, 'auto'],
    'max_depth': [None, 2, 3, 4]
}
gs = GridSearchCV(et, param_grid = params, cv=5)
gs.fit(X_train, y_train)
print(gs.best_score_)

-0.17951048043568424


Even grid searching over a few parameters has not helped our score. One last regression method below:

In [41]:
bag = BaggingRegressor()

In [42]:
bag.fit(X_train, y_train)

BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_features=False,
                 max_features=1.0, max_samples=1.0, n_estimators=10,
                 n_jobs=None, oob_score=False, random_state=None, verbose=0,
                 warm_start=False)

In [43]:
bag.score(X_train, y_train)

0.6784594950119205

In [44]:
bag.score(X_test, y_test)

-0.9035427390168047

Again, these results discourage us from thinking we can predict the target with a supervised method. In the next notebook we'll try with unsupervised clusters.