<p style="color: blue;border: 1px solid black; text-align: center; font-size: 30px ">
 Social Network Analysis of Disinformation/Influence Operation from Bots  
</p>

In [None]:
import os
# Print the full path of every file available under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns
import nltk
from wordcloud import WordCloud,STOPWORDS
import plotly.express as px
from collections import Counter

In [None]:


# Load the labelled tweets used for the word-frequency analysis.
training_data = pd.read_csv('../input/twitter-hate-speech/train_E6oV3lV.csv')

# Character length of every tweet, stored next to the raw text.
training_data['length'] = training_data['tweet'].map(len)

def vectorization(table):
    """Count how often each token occurs across all tweets in ``table``.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame exposing a ``tweet`` column of raw text documents.

    Returns
    -------
    pandas.DataFrame
        One row per vocabulary token (the token is the index) and a single
        column named ``0`` holding the token's total count over all tweets.
    """
    # CountVectorizer turns the documents into a (documents x tokens) sparse
    # matrix of token counts.
    vector = CountVectorizer()
    frequency_matrix = vector.fit_transform(table.tweet)

    # Summing over the document axis gives one total per token. ravel() always
    # yields a 1-D array, even when the vocabulary holds a single token
    # (np.squeeze would collapse that case to a 0-d array).
    sum_frequencies = np.sum(frequency_matrix, axis=0)
    frequency = np.asarray(sum_frequencies).ravel()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(vector, "get_feature_names_out"):
        feature_names = vector.get_feature_names_out()
    else:
        feature_names = vector.get_feature_names()

    frequency_df = pd.DataFrame(frequency, index=feature_names)
    return frequency_df


def graph(word_frequency, sent):
    """Draw a bar chart of the most frequent words.

    Parameters
    ----------
    word_frequency : pandas.DataFrame
        Frame indexed by word whose column ``0`` holds the counts, already
        sorted in descending order (as produced by ``vectorization`` followed
        by ``sort_values``).
    sent : str
        Sentiment label used in the chart title, e.g. ``'negative'``.
    """
    # Rows 1..50: the top-ranked row is skipped, as in the original analysis
    # (presumably a placeholder token -- TODO confirm). Slicing copes with
    # vocabularies smaller than 51 words, so the bar positions are derived
    # from the slice instead of a fixed 50.
    top_words = word_frequency[0][1:51]
    positions = np.arange(len(top_words))
    title = "Frequency of %s Words" % str(sent).capitalize()

    plt.figure(figsize=(15,8))
    plt.bar(positions, top_words, width = 0.8, color = sns.color_palette("bwr"), alpha=0.5,
            edgecolor = "black", capsize=8, linewidth=1);

    plt.xticks(positions, top_words.index, rotation=90, size=14);
    plt.xlabel("50 more frequent words", size=14);
    plt.ylabel("Frequency", size=14);
    plt.title(title, size=18)
    plt.grid(False);
    # Hide the top and right borders of the plot area.
    axes = plt.gca()
    axes.spines["top"].set_visible(False);
    axes.spines["right"].set_visible(False);
    plt.show()

    
# Keep only the tweets labelled 1, count their words and rank them by frequency.
negative_tweets = training_data.loc[training_data['label'] == 1]
word_frequency_neg = vectorization(negative_tweets).sort_values(by=0, ascending=False)

graph(word_frequency_neg, 'negative')

<p style="color: blue;border: 1px solid black; text-align: center; font-size: 20px ">
 Bots usually tweet using the same words, so looking at the frequency of repeated words may reveal interesting patterns.
</p>


In [None]:



train = pd.read_csv("../input/twitter-hate-speech/train_E6oV3lV.csv")

# Remove every whitespace-separated token that starts with '@' (mentions).
train['cleaned_tweet'] = train['tweet'].apply(
    lambda text: ' '.join(token for token in text.split() if not token.startswith('@'))
)

# Concatenate the cleaned tweets labelled 1 into one long string.
negative_words = ' '.join(train.loc[train['label'] == 1, 'cleaned_tweet'])

# Collect the tokens starting with '#', without the leading '#'.
neg_htag = [token[1:] for token in negative_words.split() if token.startswith('#')]

# Count each hashtag and tabulate the result.
neg_htag_freqcount = nltk.FreqDist(neg_htag)
neg_htag_df = pd.DataFrame({
    'Hashtag': list(neg_htag_freqcount.keys()),
    'Count': list(neg_htag_freqcount.values()),
})

# Keep the 20 hashtags with the highest counts.
most_frequent = neg_htag_df.nlargest(n=20, columns="Count")

plt.figure(figsize=(16, 5))
ax = sns.barplot(x="Hashtag", y="Count", data=most_frequent)
plt.title("Frequency of Negative Hashtags", fontsize=15)
plt.show()

<p style="color: blue;border: 1px solid black; text-align: center; font-size: 20px ">
What if we checked the hashtags that share the same concept?
</p>

In [None]:


# One long string made of every cleaned tweet labelled 1.
negative_words = ' '.join(train.loc[train['label'] == 1, 'cleaned_tweet'])

# Build the word cloud from that text.
cloud_builder = WordCloud(width=1500, height=800, max_font_size=110)
wordcloud = cloud_builder.generate(negative_words)

plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Visualizing the Negative Words', fontsize=15)
plt.show()

<p style="color: blue;border: 1px solid black; text-align: center; font-size: 20px ">
Let's visualize the negative words
</p>

In [None]:
trump = pd.read_csv('/kaggle/input/private/privet.csv')
# Drop the 'Unnamed: 0' column when present; errors='ignore' keeps the cell
# working on files that do not have it.
trump = trump.drop(columns='Unnamed: 0', errors='ignore')

# The first four characters of 'date' are used as the year and characters
# 5-6 as the month (assumes a 'YYYY-MM-...' string layout -- TODO confirm).
trump['year'] = trump['date'].str[:4]
year_month = trump['year'] + '-' + trump['date'].str[5:7]

# Group on a separate key Series instead of assigning a 'month' column into
# each groupby slice, which raised SettingWithCopyWarning.
months = []      # 'YYYY-MM' label of every group, in sorted order
datas = []       # the tweets belonging to each label
month_list = []  # average number of words per tweet for each label
for label, tweets in trump['tweet'].groupby(year_month):
    months.append(label)
    datas.append(tweets)
    # split() without arguments ignores runs of whitespace, so consecutive
    # spaces no longer count as extra (empty) words.
    word_counts = tweets.str.split().str.len()
    month_list.append(round(int(word_counts.sum()) / len(tweets), 2))

df = pd.DataFrame({'Date of tweet':months, 'Average number of words per tweet':month_list})
fig = px.line(df, 'Date of tweet', 'Average number of words per tweet', title='Number of words in tweets per month')
fig.show()



<p style="color: blue;border: 1px solid black; text-align: center; font-size: 20px ">
Counting the number of words per tweet may show us something interesting.
</p>

In [None]:


months = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
# Every row gets the same year value, so the groupby below yields one group.
trump['year'] = 2015

for year, tweets_in_year in trump.groupby('year'):
    # Characters 5-6 of each date string are read as the month number.
    count = Counter(int(date[5:7]) for date in tweets_in_year['date'])
    df = pd.DataFrame({
        'Month': list(count.keys()),
        'Number of tweets': list(count.values()),
    })

    # Replace the month numbers (1-12) with month names.
    month_list = [months[int(number - 1)] for number in df['Month']]
    df['Month'] = month_list

    fig = px.pie(df, names='Month', values='Number of tweets',
                 title='Percentage of Negative tweets per Months')
    legend_title = dict(text='Months in ' + str(year), font=dict(size=18))
    fig.update_layout(legend_title=legend_title)
    fig.show()



<p style="color: blue;border: 1px solid black; text-align: center; font-size: 20px ">
It would be useful to check the percentage of negative tweets per month.
</p>