In [174]:
import nltk as nltk
import pandas as pd
import string
import re
import numpy as np
import matplotlib.pyplot as plt

In [177]:
from nltk.corpus import stopwords
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from collections import Counter

In [178]:
Stop_words = set (stopwords.words('English'))

In [179]:
def load_dataframe(filename):
    return pd.read_csv(filename)

df = load_dataframe('trumpTweets.csv')

In [180]:
#Printing the dataframe befor manipulation
print("The dataframe befor manipulation:")
df.head(1000)

The dataframe befor manipulation:


Unnamed: 0,text,isRetweet,favorites,retweets,date
0,Republicans and Democrats have both created ou...,f,49,255,02/08/2011 18:07
1,I was thrilled to be back in the Great city of...,f,73748,17404,03/03/2020 01:34
2,RT @CBS_Herridge: READ: Letter to surveillance...,t,0,7396,17/01/2020 03:22
3,The Unsolicited Mail In Ballot Scam is a major...,f,80527,23502,12/09/2020 20:10
4,RT @MZHemingway: Very friendly telling of even...,t,0,9081,17/01/2020 13:13
...,...,...,...,...,...
995,But 2020 is a long way from over! https://t.co...,f,187699,32040,25/11/2020 15:44
996,RT @ScottAdamsSays: The most corrupt instituti...,t,0,19473,14/11/2020 05:29
997,RT @jacobkschneider: Boston Herald endorses Pr...,t,0,6642,27/10/2020 15:29
998,RT @marklevinshow: The Boston Herald endorses ...,t,0,8716,27/10/2020 15:28


In [181]:
#Exploring the data before manipulation
print("shape of the dataframe:" ,df.shape)
print("\nThe dataframe count of null values:\n",df.isnull().sum(axis = 0))
df.describe(include='all')

shape of the dataframe: (56571, 5)

The dataframe count of null values:
 text         0
isRetweet    0
favorites    0
retweets     0
date         0
dtype: int64


Unnamed: 0,text,isRetweet,favorites,retweets,date
count,56571,56571,56571.0,56571.0,56571
unique,56118,2,,,50345
top,MAKE AMERICA GREAT AGAIN!,f,,,29/06/2020 12:12
freq,51,46694,,,10
mean,,,28349.55,8618.987467,
std,,,57815.64,13306.132408,
min,,,0.0,0.0,
25%,,,10.0,59.0,
50%,,,164.0,3450.0,
75%,,,43938.5,13014.5,


In [158]:
def tokenizeTweets(sentance):
    words = word_tokenize(sentance)
    noise_free_words = [word for word in words if word not in Stop_words]
    return noise_free_words

In [159]:
def cleanPunctuation(sentance):
    new_sentance = sentance.translate(str.maketrans('', '', string.punctuation))
    return new_sentance

In [160]:
def cleanLinks(sentance):
    sentance = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', sentance, flags=re.MULTILINE)
    return(sentance)

In [182]:
def stemming(wordsList):
    #getting the root of every word for all tweets
    rootList = []
    ps = PorterStemmer()
    for w in wordsList:
        #remove the 'Retweet' tag
        if (w != 'RT'):
            rootList.append(ps.stem(w))
    return rootList

In [183]:
def lemmataizeSentance(wordsList):
    #getting the root of every word for all tweets
    rootList = []
    wl = WordNetLemmatizer()
    for w in wordsList:
        #remove the 'Retweet' tag
        if (w != 'RT'):
            rootList.append(wl.lemmatize(w))
    return rootList

In [184]:
def chunking(wordsList):
    PartsOfSpeech = pos_tag(wordsList)
    Named_entities = ne_chunk(PartsOfSpeech)
    return Named_entities

In [185]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [206]:
def remove_unnecessary_data(dataframe):
    df = dataframe.copy()
    
    #remove duplicate rows
    df = df.drop_duplicates(keep='first').copy() 
    
    #remove emojies and special characters
    
    
    df_chunk = df.copy()
    df_stem = df.copy()
    df_lemmatize = df.copy()
    
    #removing links, punc and tokenzing the words and making 3 dfs in 3 methods
    for i in range(0,10):#len(df.index)):
        temp = df['text'][i]
        temp = remove_emoji(temp)
        temp = cleanLinks(temp)
        temp = cleanPunctuation(temp)
        temp = tokenizeTweets(temp)
        if not temp:
            df = df.drop(labels=i, axis=0)
        #df_chunk['text'][i] = chunking(temp)
        df_stem['text'][i] = stemming(temp)
        #df_lemmatize['text'][i] = lemmataizeSentance(temp)
    print(df_stem)
    return df_stem #, df_lemmatize, df_chunk

In [209]:
df_clean = remove_unnecessary_data(df)
print(df_clean['text'][7])
print(df_clean['text'][8])
print(df_clean['text'][9])
print(df_clean['text'][10])

                                                    text isRetweet  favorites  \
0         [republican, democrat, creat, econom, problem]         f         49   
1      [i, thrill, back, great, citi, charlott, north...         f      73748   
2      [cbsherridg, read, letter, surveil, court, obt...         t          0   
3      [the, unsolicit, mail, in, ballot, scam, major...         f      80527   
4      [mzhemingway, veri, friendli, tell, event, com...         t          0   
...                                                  ...       ...        ...   
56566  RT @RandPaul: I don’t know why @JoeBiden think...         t          0   
56567  RT @EliseStefanik: President @realDonaldTrump ...         t          0   
56568  RT @TeamTrump: LIVE: Presidential Debate #Deba...         t          0   
56569  Just signed an order to support the workers of...         f     176289   
56570  Suburban women want Safety &amp; Security. Joe...         f      95169   

       retweets            

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stem['text'][i] = stemming(temp)


In [208]:
df_clean.to_csv(r'C:\Users\Matan\Desktop\BCS\year 2\2nd semester\data science\my_data.csv', index=True)
print(df_clean['text'][7])

[]


In [205]:
def vectorize_string(sentance):
    texts=[":)"]
    cv = CountVectorizer(stop_words=None)
    vectorizer.fit(sentance)
    vector = vectorizer.transform(sentance)
    return vector

In [127]:
def vectorize_allTweets(dataframe):
    df = dataframe.copy()
    for i in range(0,len(df.index)):
        temp = df['text'][i]
        df['text'][i] = vectorize_string(temp)
        print(df['text'][i])
        print(i)
    print(df)   
    return df

In [128]:
print(df_clean)
df_vectorized = vectorize_allTweets(df_clean)

                                                    text isRetweet  favorites  \
0         [republican, democrat, creat, econom, problem]         f         49   
1      [i, thrill, back, great, citi, charlott, north...         f      73748   
2      [cbsherridg, read, letter, surveil, court, obt...         t          0   
3      [the, unsolicit, mail, in, ballot, scam, major...         f      80527   
4      [mzhemingway, veri, friendli, tell, event, com...         t          0   
...                                                  ...       ...        ...   
56566  [randpaul, i, ’, know, joebiden, think, contin...         t          0   
56567  [elisestefanik, presid, realdonaldtrump, excel...         t          0   
56568  [teamtrump, live, presidenti, debat, debates20...         t          0   
56569  [just, sign, order, support, worker, delphi, c...         f     176289   
56570  [suburban, women, want, safeti, amp, secur, jo...         f      95169   

       retweets            

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'][i] = vectorize_string(temp)


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [129]:
scaling=StandardScaler()
scaling.fit(temp1['text'][1])
Scaled_data=scaling.transform(temp1['text'][1])
principal=PCA(n_components=3)
principal.fit(Scaled_data)
x=principal.transform(Scaled_data)
print(x.shape)

ValueError: could not convert string to float: 'i'

In [201]:
a = []
if not a:
    print(True)
else:
    print(False)

True
