# Imports
All the imports needed for our project

In [96]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from os import listdir
from os.path import isfile, join
from random import shuffle
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
import numpy as np # linear algebra
from joblib import dump, load # used for saving and loading sklearn objects
from scipy.sparse import save_npz, load_npz # used for saving and loading sparse matrices
from scipy.stats import uniform
from scipy.sparse import csr_matrix
from functools import reduce

## 1.2 Import from our code

In [83]:
from extract_dataframe import read_json
from extract_dataframe import extract_data_file
from extract_dataframe import TweetDfExtractor
from extract_dataframe import path_to_file
from clean_tweets_dataframe import Clean_Tweets


In [84]:
# Extract the data file if it has not been extracted yet.
# NOTE(review): the skip-if-already-extracted behaviour is implemented inside
# extract_data_file (imported from extract_dataframe) and is not visible here — confirm.
extract_data_file()

# 2. Extract DataFrame

In [85]:
# Load the raw tweets. The result of read_json is unpacked into two values and
# only the second one (the tweet list) is used; the first is discarded.
_, tweet_list = read_json("./data/Economic_Twitter_Data.json")
# Wrap the tweet list in the project's extractor class.
tweet = TweetDfExtractor(tweet_list)
# Build the tweet DataFrame used by the rest of the notebook.
# NOTE(review): the positional True is presumably a "save to file" flag — confirm
# against TweetDfExtractor.get_tweet_df.
tweet_df = tweet.get_tweet_df(True)

File Successfully Saved.!!!


In [86]:
# Preview the first ten extracted tweets.
tweet_df.head(10)

Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,Fri Apr 22 22:20:18 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @nikitheblogger: Irre: Annalena Baerbock sa...,0.0,0.0,de,0,355,McMc74078966,3,12,,,@nikitheblogger,
1,Fri Apr 22 22:19:16 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @sagt_mit: Merkel schaffte es in 1 Jahr 1 M...,0.0,0.0,de,0,505,McMc74078966,3,12,,,@sagt_mit,
2,Fri Apr 22 22:17:28 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @Kryptonoun: @WRi007 Pharma in Lebensmittel...,0.0,0.0,de,0,4,McMc74078966,3,12,,,"@Kryptonoun, @WRi007",
3,Fri Apr 22 22:17:20 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @WRi007: Die #Deutschen sind ein braves Vol...,0.0,0.0,de,0,332,McMc74078966,3,12,,#Deutschen #Spritpreisen #inflation #Abgaben,@WRi007,
4,Fri Apr 22 22:13:15 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @RolandTichy: Baerbock verkündet mal so neb...,0.0,0.0,de,0,386,McMc74078966,3,12,,,@RolandTichy,
5,Fri Apr 22 22:12:51 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @biggyyyyyyy: https://t.co/jvvA7F9m4U,0.0,0.0,und,0,77,McMc74078966,3,12,False,,@biggyyyyyyy,
6,Fri Apr 22 22:12:17 +0000 2022,"<a href=""http://twitter.com/download/android"" ...","RT @SiriaBlack_II: Ich bin mir sicher, dass vi...",0.0,0.0,de,0,129,McMc74078966,3,12,,#Baerbock,@SiriaBlack_II,
7,Fri Apr 22 22:10:01 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @Darling_2022: Ich wiederhole mich gerne no...,0.0,0.0,de,0,860,McMc74078966,3,12,,,@Darling_2022,
8,Fri Apr 22 22:09:38 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @rosenbusch_: Brigadegeneral a. D. Erich Va...,0.0,1.0,de,0,926,McMc74078966,3,12,,,@rosenbusch_,
9,Fri Apr 22 22:04:16 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @kickout55711802: Klasse Rede über die Fern...,0.0,0.0,de,0,285,McMc74078966,3,12,False,,@kickout55711802,


## 3. Remove unwanted columns and irrelevant tweets


### 3.1 Drop duplicates

In [87]:
# Wrap the extracted tweets in the cleaning helper and report the row count
# on either side of de-duplication.
cleanTweetsWrapper = Clean_Tweets(tweet_df)
print(f'Length before dropping duplicates: {len(cleanTweetsWrapper.df)}')
cleanTweetsWrapper.drop_duplicate()
print(f'Length after dropping duplicates: {len(cleanTweetsWrapper.df)}')


Automation in Action...!!!
Length before dropping duplicates: 24625
Length after dropping duplicates: 22481


In [88]:
# Spot-check the full text of the tweet whose index label is 0.
print(cleanTweetsWrapper.df["original_text"].loc[0])

RT @nikitheblogger: Irre: Annalena Baerbock sagt, es bricht ihr das Herz, dass man nicht bedingungslos schwere Waffen liefert.
Mir bricht e…


### 3.2 Remove Non-English Tweets

In [91]:
%%time
print('Length before removing non-English: ' + str(len(cleanTweetsWrapper.df)))
cleanTweetsWrapper.remove_non_english_tweets()
print('Length after removing non-English: ' + str(len(cleanTweetsWrapper.df)))

Length before removing non-English: 15180
Length after removing non-English: 15180
Wall time: 139 ms


In [92]:
# Preview the first two rows that remain after cleaning.
cleanTweetsWrapper.df.head(2)

Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
38,Fri Apr 22 22:17:05 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @NorthstarCharts: The 10-year yield is tell...,0.16,0.54,en,0,43,davideiacovozzi,18,55,,#gold #silver #crypto,@NorthstarCharts,
39,Fri Apr 22 13:44:53 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @MichaelAArouet: German 10y mortgage rate w...,0.15,0.175,en,0,32,davideiacovozzi,18,55,,,@MichaelAArouet,


# 4. Construct 'bag-of-words'

We need to go from the tweet DataFrame to the actual tweet text, and then to numeric values.

In [95]:
# Pull the raw tweet text out of the cleaned DataFrame as a plain Python list.
texts = cleanTweetsWrapper.df['original_text'].tolist()
# Tokenise each tweet on whitespace: `words` holds one list of tokens per tweet.
words = [t.split() for t in texts]
# Show only a small sample; printing the tokens of every tweet floods the
# notebook output and bloats the saved file.
print(words[:5])



## Now we need to flatten the list
https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-a-list-of-lists

In [None]:
# Flatten the per-tweet token lists into one flat list of tokens.
# A nested comprehension runs in linear time and yields [] when `words` is empty,
# whereas reduce(lambda x, y: x + y, words) re-copies the growing list on every
# step (quadratic) and raises TypeError on an empty input.
wordList = [word for tweet_words in words for word in tweet_words]
# Show only a small sample rather than every token in the corpus.
print(wordList[:20])