In [6]:
import nltk
# Fetch the NLTK data the cleaning helpers rely on: the stop-word corpus
# (read via nltk.corpus.stopwords) and WordNet (used by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

__Creating functions to clean tweets__

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
# Module-level set of English stop words; remove_stop_words() tests membership
# against it, so it is built once here rather than on every call.
stop_words = set(stopwords.words('english'))
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.wordnet import WordNetLemmatizer




def stem_words(words):
    """Return the Lancaster stem of every token in ``words``, in order.

    Args:
        words: iterable of already-tokenized word strings.

    Returns:
        A new list with one stem per input token.
    """
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Return the WordNet lemma of every token in ``words``, treating each as a verb.

    Args:
        words: iterable of already-tokenized word strings.

    Returns:
        A new list with one lemma per input token, in the original order.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

def stem_and_lemmatize(words, lemmatize=False):
    """Normalize a token list and join it into one space-separated string.

    The tokens are always stemmed with ``stem_words``. The original body
    returned right after stemming, which left its lemmatization step
    unreachable; that step is now reachable through ``lemmatize`` while the
    default keeps the stem-only output existing callers already get.

    Args:
        words: list of tokenized words.
        lemmatize: when True, additionally pass the stems through
            ``lemmatize_verbs`` before joining. Defaults to False.

    Returns:
        The processed tokens joined by single spaces ("" for an empty list).
    """
    tokens = stem_words(words)
    if lemmatize:
        # NOTE(review): this keeps the order of the original dead code
        # (stem, then lemmatize). Lemmatizing before stemming may be more
        # effective - confirm which order is wanted before enabling.
        tokens = lemmatize_verbs(tokens)
    return " ".join(tokens)




def clean_string(mystr):
    """Reduce a raw tweet to lower-case letters and whitespace.

    Steps, in order: lower-case the text; blank out backslash escape
    sequences (e.g. ``\\xe2``), ``@mentions``, ``#hashtags`` and http(s) URLs;
    strip a ``b'...'`` / ``b"..."`` wrapper if the text has one; replace every
    remaining character that is neither alphabetic nor whitespace with a space.

    Args:
        mystr: the tweet text. Non-string values (for example the float NaN
            pandas uses for an empty CSV cell) are treated as empty text.

    Returns:
        The cleaned string; "" for empty or non-string input.
    """
    # Missing cells arrive as float NaN from read_csv; .lower() would raise.
    if not isinstance(mystr, str):
        return ""

    mystr=mystr.lower()
    mystr=re.sub(r"\\\w+", " ", mystr)
    mystr=re.sub(r"\@\w+"," ",mystr)
    mystr=re.sub(r"\#\w+"," ",mystr)
    mystr=re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"," ",mystr)

    # Only strip the bytes-repr wrapper when it is really there. Slicing
    # [2:-1] unconditionally ate the first two and the last character of any
    # text that was stored as a plain string.
    has_bytes_wrapper = (
        len(mystr) >= 3
        and mystr[0] == "b"
        and mystr[1] in "'\""
        and mystr[-1] == mystr[1]
    )
    if has_bytes_wrapper:
        mystr=mystr[2:-1]

    # The text is already lower-cased above, so characters are kept as-is.
    cleantext = "".join([x if (x.isalpha() or x.isspace()) else ' ' for x in mystr])

    return cleantext

def remove_stop_words(mystr):
    """Tokenize ``mystr`` on whitespace and filter out uninformative tokens.

    A token is discarded when it is shorter than two characters or is a
    member of the module-level ``stop_words`` set.

    Args:
        mystr: cleaned text to tokenize.

    Returns:
        List of the surviving tokens in their original order.
    """
    kept = []
    for token in mystr.split():
        if len(token) < 2 or token in stop_words:
            continue
        kept.append(token)
    return kept
   
    

   


__Reading files containing tweets__

In [8]:

import os

goal_dir = os.path.join(os.getcwd(), "tweets_raw/")

# Print the name of every CSV file found directly inside goal_dir.
csv_names = [entry for entry in os.listdir(goal_dir) if entry.endswith(".csv")]
for filename in csv_names:
    print(filename)

eBay_tweets.csv
eBay_tweets_drop2.csv
facebook_tweets.csv
facebook_tweets_drop2.csv
FoodandTravelEd_tweets.csv
FoodandTravelEd_tweets_drop2.csv
ladygaga_tweets.csv
ladygaga_tweets_drop2.csv
MTV_tweets.csv
MTV_tweets_drop2.csv
nytimes_tweets.csv
nytimes_tweets_drop2.csv
parenting_tweets.csv
parenting_tweets_drop2.csv
premierleague_tweets.csv
premierleague_tweets_drop2.csv
tesla_tweets.csv
tesla_tweets_drop2.csv
usedgov_tweets.csv
usedgov_tweets_drop2.csv


__Creating tfidf matrix for every user account__

In [81]:
from numpy import asarray
from numpy import savetxt
import numpy as np
import os  # imported before first use so the cell also runs on a fresh kernel

# Directory that holds one CSV of raw tweets per account.
goal_dir = os.path.join(os.getcwd(), "tweets_raw/")

# Maps CSV file name -> TF-IDF DataFrame: one row per tweet (indexed by the
# file's 'id' column), one column per term of that file's own vocabulary.
dataframes = {}

for filename in os.listdir(goal_dir):
    if filename.endswith(".csv"):
        # Read from goal_dir so listing and reading always use the same folder.
        file = pd.read_csv(os.path.join(goal_dir, filename))
        file['clean_text']=file['text'].apply(clean_string)
        file['word list']=file['clean_text'].apply(remove_stop_words)
        file['cleaned_word_list']=file['word list'].apply(stem_and_lemmatize)

        # The token lists are only an intermediate; keep the joined string.
        del file['word list']

        # A fresh vectorizer per file: vocabulary and IDF weights are fitted
        # on this file's tweets only.
        vectorizer = TfidfVectorizer()
        tfidfmatrix = vectorizer.fit_transform(file['cleaned_word_list'])

        # get_feature_names() was removed in scikit-learn 1.2; use its
        # replacement when present and fall back on older installs.
        if hasattr(vectorizer, "get_feature_names_out"):
            vocab = vectorizer.get_feature_names_out()
        else:
            vocab = vectorizer.get_feature_names()
        tfidf_data=tfidfmatrix.toarray()

        tfidf_pd=pd.DataFrame(data=tfidf_data,columns=vocab,index=file['id'])

        dataframes[filename] = tfidf_pd



In [10]:
# Sanity check: show which CSV file names ended up as keys of `dataframes`.
dataframes.keys()

dict_keys(['eBay_tweets.csv', 'eBay_tweets_drop2.csv', 'facebook_tweets.csv', 'facebook_tweets_drop2.csv', 'FoodandTravelEd_tweets.csv', 'FoodandTravelEd_tweets_drop2.csv', 'ladygaga_tweets.csv', 'ladygaga_tweets_drop2.csv', 'MTV_tweets.csv', 'MTV_tweets_drop2.csv', 'nytimes_tweets.csv', 'nytimes_tweets_drop2.csv', 'parenting_tweets.csv', 'parenting_tweets_drop2.csv', 'premierleague_tweets.csv', 'premierleague_tweets_drop2.csv', 'tesla_tweets.csv', 'tesla_tweets_drop2.csv', 'usedgov_tweets.csv', 'usedgov_tweets_drop2.csv'])

In [11]:
# Pull out the two Tesla TF-IDF frames (base file and its '_drop2' sibling).
tesla_tfidf, tesla_tfidf_2 = (
    dataframes['tesla_tweets.csv'],
    dataframes['tesla_tweets_drop2.csv'],
)

In [12]:
# TF-IDF frames built from the '<account>_tweets.csv' files.
(ebday_tfidf, fb_tfidf, fnt_tfidf, gaga_tfidf, mtv_tfidf,
 parenting_tfidf, fpl_tfidf, gov_tfidf, nyt_tfidf) = (
    dataframes[key] for key in (
        'eBay_tweets.csv',
        'facebook_tweets.csv',
        'FoodandTravelEd_tweets.csv',
        'ladygaga_tweets.csv',
        'MTV_tweets.csv',
        'parenting_tweets.csv',
        'premierleague_tweets.csv',
        'usedgov_tweets.csv',
        'nytimes_tweets.csv',
    )
)

# TF-IDF frames built from the matching '<account>_tweets_drop2.csv' files.
(ebday_tfidf_2, fb_tfidf_2, fnt_tfidf_2, gaga_tfidf_2, mtv_tfidf_2,
 parenting_tfidf_2, fpl_tfidf_2, gov_tfidf_2, nyt_tfidf_2) = (
    dataframes[key] for key in (
        'eBay_tweets_drop2.csv',
        'facebook_tweets_drop2.csv',
        'FoodandTravelEd_tweets_drop2.csv',
        'ladygaga_tweets_drop2.csv',
        'MTV_tweets_drop2.csv',
        'parenting_tweets_drop2.csv',
        'premierleague_tweets_drop2.csv',
        'usedgov_tweets_drop2.csv',
        'nytimes_tweets_drop2.csv',
    )
)

__Setting appropriate labels__

In [13]:
# Tesla tweets are the positive class. The column is added in place, so the
# frames stored in `dataframes` gain it too.
for positive_frame in (tesla_tfidf, tesla_tfidf_2):
    positive_frame['label'] = 1

In [14]:
# Every non-Tesla account is the negative class (label 0), added in place.
negative_frames = (
    fb_tfidf, fnt_tfidf, ebday_tfidf, gaga_tfidf, mtv_tfidf,
    nyt_tfidf, parenting_tfidf, fpl_tfidf, gov_tfidf,
    fb_tfidf_2, fnt_tfidf_2, ebday_tfidf_2, gaga_tfidf_2, mtv_tfidf_2,
    nyt_tfidf_2, parenting_tfidf_2, fpl_tfidf_2, gov_tfidf_2,
)
for negative_frame in negative_frames:
    negative_frame['label'] = 0

__Combining all tfidf matrices__

In [15]:
# Stack all per-file matrices row-wise. The files have different vocabularies,
# so a term missing from a file is NaN for that file's rows; those gaps are
# filled with 0.
frames_to_stack = [
    tesla_tfidf, fb_tfidf, fnt_tfidf, ebday_tfidf, gaga_tfidf,
    mtv_tfidf, parenting_tfidf, fpl_tfidf, gov_tfidf, nyt_tfidf,
    tesla_tfidf_2, fb_tfidf_2, fnt_tfidf_2, ebday_tfidf_2, gaga_tfidf_2,
    mtv_tfidf_2, parenting_tfidf_2, fpl_tfidf_2, gov_tfidf_2, nyt_tfidf_2,
]
vertical_stack = pd.concat(frames_to_stack, axis=0).fillna(0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [None]:
# (rows, columns) of the stacked matrix before duplicate rows are dropped;
# the 'label' column is still included here.
vertical_stack.shape

In [16]:
# Remove rows that are identical in every column (label included) and
# renumber the index, discarding the tweet ids.
vertical_stack=vertical_stack.drop_duplicates().reset_index(drop=True)
# Target vector, taken before the column is removed from the feature matrix.
y = vertical_stack['label']
# Keyword form of drop: passing the axis positionally (`drop('label', 1)`) was
# deprecated in pandas 1.3 and is an error from pandas 2.0.
# NOTE(review): this cell rebinds `vertical_stack`, so running it twice fails
# because 'label' is already gone.
vertical_stack = vertical_stack.drop(columns='label')

In [None]:
# (rows, columns) of the feature matrix after de-duplication and removal of 'label'.
vertical_stack.shape

__Performing Chi-square for variable selection__

In [27]:
from sklearn.feature_selection import chi2

In [None]:
# chi2 returns a (scores, p-values) pair with one entry per column of
# vertical_stack; later cells rank terms by chi2score[0].
# NOTE(review): this is computed on the full matrix, ahead of the train/test
# split further down, so test rows influence which terms get selected -
# confirm whether that is intended.
chi2score = chi2(vertical_stack, y)

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar + line plot of the 50 terms with the highest chi-square score.
plt.figure(figsize=(15,10))
# Pair every term with its score and sort ascending, so the last 50 entries
# are the strongest terms and the best one ends up at the top of the axis.
wscores = zip(vertical_stack.columns, chi2score[0])
wchi2 = sorted(wscores, key=lambda x:x[1])
# Transpose [(term, score), ...] into (terms, scores).
topchi2 = list(zip(*wchi2[-50:]))
x = range(len(topchi2[1]))
labels = topchi2[0]
plt.barh(x,topchi2[1], align='center', alpha=0.2)
plt.plot(topchi2[1], x, '-o', markersize=5, alpha=0.8)
plt.yticks(x, labels)
# Raw string: '\c' is not a valid escape sequence and triggers a warning in a
# normal string literal. The rendered label text is unchanged.
plt.xlabel(r'$\chi^2$')

__Splitting data into Training and Testing Data__

In [52]:
from sklearn.model_selection import train_test_split
# 80/20 split. The fixed random_state makes the split (and every number
# derived from it below) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(vertical_stack, y, test_size=0.20, random_state=42)

In [53]:
from collections import Counter
# Label counts per split. Nothing is displayed by this cell; inspect
# counter_train / counter_test to see the class balance.
counter_train = Counter(y_train)
counter_test = Counter(y_test)

__Oversampling training data__

In [None]:
conda install -c glemaitre imbalanced-learn

In [17]:
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE

In [18]:
# SMOTE synthesizes minority-class rows using random choices; seed it so the
# resampled training set is the same on every run.
oversample = SMOTE(random_state=42)

In [65]:
# Resample the training split only; X_test / y_test are left untouched.
X_new, y_new = oversample.fit_resample(X_train, y_train)

__Feature selection using results from Chi-square__

In [83]:
# Names of the 1000 columns with the highest chi-square score, best first.
top_terms = vertical_stack.columns[np.argsort(chi2score[0])[::-1]][:1000]
X_select = X_new[top_terms]

In [84]:
# (rows, columns) of the oversampled training matrix restricted to the selected terms.
X_select.shape

(79400, 1000)

In [85]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

__Classification and prediction__

In [None]:
# AdaBoost over 200 depth-3 decision trees, trained on the oversampled,
# feature-selected training data. random_state pins the boosting run so the
# fitted model and the metrics below are reproducible.
classifier = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=3),
    n_estimators=200,
    random_state=42,
)
classifier.fit(X_select, y_new)

In [None]:
# Restrict the held-out rows to the same 1000 top-scoring columns, in the
# same order, that the classifier was trained on.
test_terms = vertical_stack.columns[np.argsort(chi2score[0])[::-1]][:1000]
predictions = classifier.predict(X_test[test_terms])

In [None]:
# Confusion matrix on the held-out split: rows are true labels, columns are
# predicted labels.
confusion_matrix(y_test, predictions)


In [None]:
from sklearn.metrics import f1_score
# average='binary' reports F1 for the positive class only - here label 1,
# which the labelling cells assign to the Tesla tweets.
f1_score(y_test, predictions, average='binary')

__Extras__

In [None]:
# Write the oversampled training data and the untouched test split to CSV
# files in the working directory (header row included, index omitted).
exports = (
    (X_new, 'X_train_oversampled.csv'),
    (y_new, 'y_train_oversampled.csv'),
    (X_test, 'X_test.csv'),
    (y_test, 'y_test.csv'),
)
for table, csv_path in exports:
    table.to_csv(csv_path, index = False, header=True)

In [79]:
# NOTE(review): the stored output below differs from the earlier
# X_select.shape cell, so the two were produced by different runs - re-run
# top to bottom before relying on either number.
X_select.shape

(44204, 1000)

In [None]:
import os  # imported before first use so the cell also runs on a fresh kernel

# Directory with one CSV of tweets per Tesla follower.
goal_dir = os.path.join(os.getcwd(), "tweets_of_teslafollowers/")

# Maps CSV file name -> TF-IDF DataFrame restricted to the selected terms.
tesla_followers = {}

# Names of the 5000 highest-scoring chi-square terms. This does not depend on
# the file being processed, so it is computed once instead of per iteration.
# NOTE(review): the classifier above is trained on the top 1000 terms, not
# 5000 - confirm which cut-off is intended here.
selected_terms = vertical_stack.columns[np.argsort(chi2score[0])[::-1]][:5000]

for filename in os.listdir(goal_dir):
    if filename.endswith(".csv"):
        # Read from goal_dir so listing and reading always use the same folder.
        file = pd.read_csv(os.path.join(goal_dir, filename))
        file['clean_text']=file['text'].apply(clean_string)
        file['word list']=file['clean_text'].apply(remove_stop_words)
        file['cleaned_word_list']=file['word list'].apply(stem_and_lemmatize)

        del file['word list']

        vectorizer = TfidfVectorizer()
        try:
            tfidfmatrix = vectorizer.fit_transform(file['cleaned_word_list'])
            # get_feature_names() was removed in scikit-learn 1.2; use its
            # replacement when present and fall back on older installs.
            if hasattr(vectorizer, "get_feature_names_out"):
                vocab = vectorizer.get_feature_names_out()
            else:
                vocab = vectorizer.get_feature_names()
            tfidf_data=tfidfmatrix.toarray()
            tfidf_pd=pd.DataFrame(data=tfidf_data,columns=vocab,index=file['id'])
            # Keep only columns that are among the selected terms. Keyword
            # form of drop: the positional axis argument is gone in pandas 2.0.
            tfidf_pd = tfidf_pd.drop(columns=tfidf_pd.columns.difference(selected_terms))
            tesla_followers[filename] = tfidf_pd
        except Exception as err:
            # Still best-effort: a file that cannot be vectorized is skipped.
            # Unlike a bare `except:` this lets KeyboardInterrupt through and
            # reports which file failed and why.
            print("Skipping {}: {}".format(filename, err))
            print(file['cleaned_word_list'])

        

In [None]:
# Displays the whole dict of per-file DataFrames; the output can be very large.
tesla_followers

In [None]:
# Print (rows, columns) of each follower's filtered TF-IDF frame.
for follower_tfidf in tesla_followers.values():
    print(follower_tfidf.shape)

In [None]:
# Debug view: `file` is whatever DataFrame the most recently run loading loop
# left behind, i.e. the last CSV it read.
file['clean_text']

In [None]:
# Scratch check on the last follower frame: list its columns that are NOT
# among the top 2500 chi-square terms, then show the frame without them.
# The difference is computed once and reused instead of being evaluated
# twice with the first result thrown away.
unselected_terms = tfidf_pd.columns.difference(vertical_stack.columns[np.argsort(chi2score[0])[::-1]][:2500])
# Keyword form of drop: the positional axis argument (`drop(..., 1)`) is an
# error from pandas 2.0.
tfidf_pd.drop(columns=unselected_terms)