# Setup

In [None]:
# Install the third-party packages imported in the next cell:
# hazm (Normalizer and tokenizers used below) and langdetect (`detect`).
! pip install hazm
! pip install langdetect

In [None]:
# Import needed libraries

from google.colab import drive  # to mount Google Drive to Colab notebook
import tweepy                   # Python wrapper around Twitter API
import json
import pandas as pd
import csv
from datetime import date
from datetime import datetime
import time
import matplotlib.pyplot as plt
from pandas.plotting import table

from __future__ import unicode_literals
from hazm import *
from langdetect import detect

import os
import glob
import shutil

In [None]:
# Mount Google Drive at /content/drive; every data path used in later cells
# lives under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

# Twitter Data Collection

## Log into Twitter API

In [None]:
# Load Twitter API secrets from an external file so they never appear in the notebook
path = '/content/drive/MyDrive/Project/'

# Use a context manager so the secrets file handle is closed as soon as it is parsed.
with open(path + 'secrets.json', encoding='utf-8') as secrets_file:
    secrets = json.load(secrets_file)

consumer_key = secrets['consumer_key']
consumer_secret = secrets['consumer_secret']
access_token = secrets['access_token']
access_token_secret = secrets['access_token_secret']

# Build the OAuth handler from the four credentials and create the API client
# that the rest of the notebook uses as `api`.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

## Checking the connection

In [None]:
# Smoke test for the credentials: fetch the home timeline through `api`
# and print the text of every returned tweet.
public_tweets = api.home_timeline()
for status in public_tweets:
    print(status.text)

## Helper Functions

In [None]:
def save_json(file_name, file_content):
  """Save `file_content` as a JSON file on Google Drive.

  file_name: name of the data file to create; it is appended to the
             module-level `path`
  file_content: the JSON-serializable data to save

  The file is written as UTF-8 with 4-space indentation, and non-ASCII
  characters are stored as-is rather than as \\u escapes.
  """
  target = path + file_name
  with open(target, 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(file_content, ensure_ascii=False, indent=4))

In [None]:
def limit_handled(cursor, list_name):
    """Yield items from `cursor`, pausing when the Twitter API rate limit is hit.

    cursor: an iterator exposing a `.next()` method (such as a tweepy cursor iterator)
    list_name: the list the caller is filling; only its length is printed when
               a rate limit is hit

    Yields every item returned by `cursor.next()` and finishes cleanly once the
    cursor is exhausted.
    """
    while True:
        try:
            item = cursor.next()
        except StopIteration:
            # The cursor is exhausted. A StopIteration escaping a generator body
            # is turned into RuntimeError (PEP 479), so end the generator here.
            return
        except tweepy.RateLimitError:
            print("\nCurrent number of data points in list = " + str(len(list_name)))
            print('Hit Twitter API rate limit.')
            # Sleep 15 minutes in three 5-minute steps, announcing the time left.
            for i in range(3, 0, -1):
                print("Wait for {} mins.".format(i * 5))
                time.sleep(5 * 60)
        except tweepy.error.TweepError:
            # Other API errors are reported and the request is retried on the
            # next loop iteration (a persistent error therefore retries forever).
            print('\nCaught TweepError exception' )
        else:
            yield item

## Get All Tweets


In [None]:
def get_all_tweets(screen_name, out_dir='/content/drive/MyDrive/Project/data/raw/'):
    """Download a user's timeline and save it as `<screen_name>_tweets.csv`.

    screen_name: Twitter handle whose timeline is fetched through the global `api`
    out_dir: directory the CSV is written to (defaults to the raw-data folder)

    The CSV has the columns id, created_at, text, likes, in reply to, retweeted.
    Nothing is written when the timeline is empty. Errors raised by
    `api.user_timeline` propagate to the caller.
    """
    # list holding every tweepy Tweet fetched so far, most recent first
    alltweets = []

    # initial request for the most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # keep grabbing tweets until a request comes back empty
    while len(new_tweets) > 0:
        alltweets.extend(new_tweets)

        # id of the oldest tweet so far, less one; all subsequent requests pass it
        # as max_id to prevent duplicates
        oldest = alltweets[-1].id - 1
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)

    if not alltweets:
        # An empty timeline used to crash on alltweets[-1] with IndexError,
        # which the collection loop does not catch.
        print("No tweets returned for " + screen_name)
        return

    # transform the tweepy tweets into a 2D array that will populate the csv
    outtweets = [[tweet.id_str, tweet.created_at, tweet.text, tweet.favorite_count,
                  tweet.in_reply_to_screen_name, tweet.retweeted] for tweet in alltweets]

    # write the csv; newline='' is what the csv module expects for files it writes,
    # and the explicit encoding keeps non-ASCII tweet text platform-independent
    out_file = os.path.join(out_dir, '%s_tweets.csv' % screen_name)
    with open(out_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["id", "created_at", "text", "likes", "in reply to", "retweeted"])
        writer.writerows(outtweets)

## Data Collection Main Script

In [None]:
if __name__ == '__main__':
  # Accounts whose timelines are downloaded, one CSV per account.
  users = [
      "halfmaleficent", "bardia_heydari", "azdivodadmalool", "behdadesfahbod",
      "elZenakarGrand3", "setarebyt", "thegarbled", "parsa_sann", "SaabiTheSab",
      "afraquotidian", "Movaghghati", "Parxya",
      "Checodara", "Th3yCallMeMamad", "CaveTheNick",
      "tantanani", "60zqueen",
      "NegarJamalifard", "aCarnivalofRust", "Sepehr_San",
      "erfanafre", "theparsius", "chameleon5421", "SheTweeting",
  ]
  for user in users:
    # A TweepError on one account is reported and the loop moves on to the next.
    try:
      get_all_tweets(user)
    except tweepy.TweepError:
      print("Failed to run the command on " + user)
    else:
      print(user + " DONE")

# Cleaning Data 

In [None]:
# NOTE: this rebinds the global `path` (previously the project root) to the raw
# data folder; `save_json` builds its target from `path`, so it is affected too.
path = '/content/drive/MyDrive/Project/data/raw/'
clean_path = '/content/drive/MyDrive/Project/data/clean/'

# Tweets created before this date (YYYYMMDD) are discarded.
MIN_CREATED = 20201220


def _strip_noise_tokens(text):
  """Return `text` without tokens that contain '@', 'http' or ':' or start with '='."""
  # Build a new list instead of removing from the list being iterated: removing
  # in place skips the token that follows every removed one.
  kept = [token for token in text.split()
          if not ('@' in token or 'http' in token or ':' in token or token[0] == '=')]
  return ' '.join(kept)


# Empty the output folder first so stale results never mix with this run's files.
for stale_name in os.listdir(clean_path):
  os.remove(os.path.join(clean_path, stale_name))

normalizer = Normalizer()

for filename in os.listdir(path):
  # Only per-user CSVs are cleaned. The combined "000ALL..." file is written into
  # this same folder by a later cell and must not be treated as a user file on re-run.
  if not filename.endswith(".csv") or filename.startswith("000ALL"):
    continue

  tweets = pd.read_csv(path + filename)
  drop_index = []
  for row_index, row in tweets.iterrows():
    # created_at starts with a YYYY-MM-DD date; compare it as the integer YYYYMMDD
    created = int(row['created_at'].split()[0].replace("-", ""))
    if created < MIN_CREATED:
      drop_index.append(row_index)
      continue

    text = row['text']
    if not isinstance(text, str):
      # an empty text cell is read back by pandas as NaN (a float)
      drop_index.append(row_index)
      continue

    cleaned = _strip_noise_tokens(text)

    try:
      lang = detect(cleaned)
    except Exception:
      # detect() raises on text it cannot analyse (e.g. nothing left after
      # stripping); such rows are dropped. Unlike a bare `except:`, this does
      # not swallow KeyboardInterrupt.
      drop_index.append(row_index)
      continue

    # keep only tweets detected as Persian ('fa')
    if lang != 'fa':
      drop_index.append(row_index)
      continue

    tweets.loc[row_index, 'text'] = normalizer.normalize(cleaned)

  tweets = tweets.drop(drop_index)
  # index=False keeps the row index from being stored as an extra unnamed column
  tweets.to_csv(clean_path + filename[:-4] + '_clean.csv', index=False)
  print(filename + " CLEANED")

Get each user's first and last tweet dates to make sure we've chosen the right users.

In [None]:
os.chdir(clean_path)
extension = 'csv'
all_filenames = glob.glob('*.{}'.format(extension))

def twtcount(filename):
  """Return the number of CSV rows in `filename` (every row the reader yields, header included)."""
  # The context manager closes the handle; newline='' is what the csv module
  # expects for files it reads.
  with open(filename, newline='', encoding='utf-8') as csv_file:
    return len(list(csv.reader(csv_file)))

# Use descriptive names instead of `sum` / `all`: rebinding those builtins would
# break every later cell that calls them in the same kernel session.
total_rows = 0
date_ranges = []
for f in all_filenames:
  user_tweets = pd.read_csv(f)
  if user_tweets.empty:
    # no rows in this file, so there is no date range to report
    date_ranges.append((f, None, None))
  else:
    # the last row is reported as the first tweet and the first row as the last
    # (presumably the files are ordered newest-first — verify against the raw data)
    first = user_tweets.iloc[-1]
    last = user_tweets.iloc[0]
    date_ranges.append((f, first['created_at'], last['created_at']))
  total_rows += twtcount(f)

# One (file, first date, last date) tuple per line, ordered by file name
date_ranges = sorted(date_ranges)
for entry in date_ranges:
  print(entry)

print(total_rows)

Combine all raw and clean tweets separately, and sort by date

In [None]:
def sort(path_, filename):
  """Merge every CSV in `path_` into one file ordered by `created_at`.

  path_: directory holding the CSV files to combine
  filename: name of the combined CSV, created inside `path_`

  The output file itself is excluded from the inputs, so running the function
  again does not duplicate rows. The process working directory is left
  untouched. Prints the number of rows written (header included).
  Raises ValueError when `path_` contains no CSV to combine.
  """
  out_path = os.path.join(path_, filename)
  # Sorted for a reproducible read order; skip the result of a previous run.
  sources = sorted(
      f for f in glob.glob(os.path.join(path_, '*.csv'))
      if os.path.abspath(f) != os.path.abspath(out_path)
  )
  if not sources:
    raise ValueError("No CSV files to combine in " + str(path_))

  combined_csv = pd.concat([pd.read_csv(f) for f in sources])
  sorted_df = combined_csv.sort_values(by=['created_at'], ascending=True)
  sorted_df.to_csv(out_path, index=False, encoding='utf-8-sig')

  # Re-read the written file to report its row count, closing the handle afterwards.
  with open(out_path, newline='', encoding='utf-8-sig') as written:
    lines = len(list(csv.reader(written)))
  print(lines)

# Build the combined raw and cleaned datasets; each call writes its output CSV
# into the same directory it reads its inputs from.
sort(path, "000ALLRAW.csv")  
sort(clean_path, "000ALLCLEANED.csv")  


# Classification and Tokenization

In [None]:
# Seed the classification dataset: every tweet starts in class "cold" with empty
# 'sentences' and 'words' columns, then the result is saved under a new name.
tweets = pd.read_csv(
    "/content/drive/MyDrive/Project/data/clean/000ALLCLEANED.csv"
).assign(**{'class': "cold", 'sentences': "", 'words': ""})
tweets.to_csv(
    "/content/drive/MyDrive/Project/data/clean/000ALLCLEANED_COMPLETE.csv",
    index=False,
)

In [None]:
from hazm import sent_tokenize, word_tokenize

COMPLETE_CSV = "/content/drive/MyDrive/Project/data/clean/000ALLCLEANED_COMPLETE.csv"
# Tweets created after this date (YYYYMMDD) are labelled "warm"; rows created on
# or before it keep the class value loaded from the file.
WARM_AFTER = 20210320

fulldata_tweets = pd.read_csv(COMPLETE_CSV)

sentences_list = []
words_list = []
for row_index, row in fulldata_tweets.iterrows():
  # tokenize the tweet into sentences and into words
  text = row['text']
  sentences_list.append(sent_tokenize(text))
  words_list.append(word_tokenize(text))

  # created_at starts with a YYYY-MM-DD date; compare it as the integer YYYYMMDD
  created = int(row['created_at'].split()[0].replace("-", ""))
  if created > WARM_AFTER:
    fulldata_tweets.loc[row_index, 'class'] = "warm"

fulldata_tweets['sentences'] = sentences_list
fulldata_tweets['words'] = words_list

# index=False matches how this file was first written and keeps the row index
# from being stored as an extra unnamed column that would pile up on re-runs.
fulldata_tweets.to_csv(COMPLETE_CSV, index=False)
print("DONE")

# Statistics

**Imports**

In [None]:
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
import numpy as np
import matplotlib.pyplot as plt

# Load the tokenized, labelled tweets and keep the per-row class labels as a plain list.
fulldata = pd.read_csv(
    "/content/drive/MyDrive/Project/data/clean/000ALLCLEANED_COMPLETE.csv"
)
classes = fulldata['class'].tolist()

**Sentence Statistics**

In [None]:
import ast

sent_warm = []
sent_cold = []

# Each 'sentences' cell is the text form of a Python list of strings. Parsing it
# with ast.literal_eval recovers the exact sentences, whereas splitting on "', '"
# miscounts whenever a sentence is quoted with double quotes or the list is empty.
# Iterating the column once also avoids rebuilding list(fulldata[...]) on every
# row, and no builtin (`all`) is shadowed.
for label, cell in zip(classes, fulldata['sentences']):
    tweet_sentences = ast.literal_eval(cell)
    if label == 'cold':
        sent_cold.extend(tweet_sentences)
    elif label == 'warm':
        sent_warm.extend(tweet_sentences)

print("Number of sentences in class cold: ",len(sent_cold))
print("Number of sentences in class warm: ",len(sent_warm))
print("Number of sentences in total: ",len(sent_cold + sent_warm))


**Single-Class Word Statistics**

In [None]:
import ast

words = []
words_cold = []
words_warm = []

# Each 'words' cell is the text form of a Python list of tokens. Parsing it with
# ast.literal_eval yields clean tokens; splitting on "', '" left a trailing "']"
# glued to the last token of every tweet, which inflated the vocabulary and
# distorted the frequency counts.
for label, cell in zip(classes, fulldata['words']):
    for token in ast.literal_eval(cell):
        # keep only tokens that contain a non-ASCII character and no '@'
        if token.isascii() or '@' in token:
            continue
        words.append(token)
        if label == 'cold':
            words_cold.append(token)
        elif label == 'warm':
            words_warm.append(token)

print("Number of words in class cold: ", len(words_cold))
print("Number of words in class warm: ", len(words_warm))
print("Number of words in total: ", len(words))

print("Number of unique words in class cold: ", len(set(words_cold)))
print("Number of unique words in class warm: ", len(set(words_warm)))
print("Number of unique words in total: ", len(set(words)))


**Inter-Class Word Statistics**

In [None]:
def _split_by_membership(tokens, vocabulary):
    """Return (outside, inside): tokens absent from / present in `vocabulary`, order kept."""
    outside, inside = [], []
    for token in tokens:
        (inside if token in vocabulary else outside).append(token)
    return outside, inside


# Vocabulary of each class, the part they share and the part only one of them has
cold_unique = set(words_cold)
warm_unique = set(words_warm)
common_unique = cold_unique.intersection(warm_unique)
unc_unique = cold_unique.symmetric_difference(warm_unique)

# Split every class's tokens into class-exclusive ("unc") and shared ("c") ones,
# then count occurrences in each group.
unc_cold, c_cold = _split_by_membership(words_cold, common_unique)
unc_warm, c_warm = _split_by_membership(words_warm, common_unique)

unc_ranking_cold = Counter(unc_cold)
unc_ranking_warm = Counter(unc_warm)
c_ranking_cold = Counter(c_cold)
c_ranking_warm = Counter(c_warm)

print("Number of unique words common between the two classes: ", len(common_unique))
print("Number of unique words uncommon between the two classes: ", len(unc_unique))
print("Top 10 uncommon cold words: ", unc_ranking_cold.most_common(10))
print("Top 10 uncommon warm words: ", unc_ranking_warm.most_common(10))
print("Top 10 common cold words: ", c_ranking_cold.most_common(10))
print("Top 10 common warm words: ", c_ranking_warm.most_common(10))


**Relative Normalized Frequency**

In [None]:
# Relative normalized frequency of every shared word: its share of one class's
# tokens divided by its share of the other class's tokens.
cold_RNF = {
    word: (c_ranking_cold[word] / len(words_cold)) / (c_ranking_warm[word] / len(words_warm))
    for word in common_unique
}
warm_RNF = {
    word: (c_ranking_warm[word] / len(words_warm)) / (c_ranking_cold[word] / len(words_cold))
    for word in common_unique
}

# Work on (word, score) pairs from here on
cold_RNF = cold_RNF.items()
warm_RNF = warm_RNF.items()

# Highest ratio first
sorted_cold_RNF = sorted(cold_RNF, key=lambda pair: pair[1], reverse=True)
sorted_warm_RNF = sorted(warm_RNF, key=lambda pair: pair[1], reverse=True)

print("Top 10 cold words according to RNF: ", sorted_cold_RNF[:10])
print("Top 10 warm words according to RNF: ", sorted_warm_RNF[:10])

**TF-IDF**

In [None]:
# TfidfVectorizer is used below but the notebook's import cell only brings in
# TfidfTransformer and CountVectorizer, so import it here.
from sklearn.feature_extraction.text import TfidfVectorizer

# Treat each class as one document made of all its tokens
joined_cold = ' '.join(words_cold)
joined_warm = ' '.join(words_warm)

# NOTE(review): the "u200c..." entries look like a mangled "\u200c" (zero-width
# non-joiner) prefix — confirm the intended stop words.
stop_words = [".", "که", "در", "با", "به","از", "هم", "و", "رو", "یه", "،", "؟", "تا", "اون", "ولی", "برای", "«", "می", "نمی", "u200cی", "u200cها", "u200cهای"]
vectorizer = TfidfVectorizer(stop_words=stop_words)

vectors = vectorizer.fit_transform([joined_cold, joined_warm])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per document: row 0 is the cold class, row 1 the warm class
df_tfidf = pd.DataFrame(vectors.toarray(), columns=feature_names)
cold_tfidf = df_tfidf.iloc[0]
warm_tfidf = df_tfidf.iloc[1]

In [None]:
print("Top 10 cold words according to TF-IDF")

# Build the table under its own name: rebinding `cold_tfidf` (a Series) to a
# DataFrame makes this cell fail when it is run a second time.
cold_tfidf_table = pd.DataFrame({"word": cold_tfidf.index, "TFIDF": cold_tfidf.values})
cold_tfidf_table[["word", "TFIDF"]].sort_values("TFIDF", ascending=False).head(10)



In [None]:
# The ranking below is by TF-IDF score, so the heading says TF-IDF (it used to say RNF).
print("Top 10 warm words according to TF-IDF")

# Build the table under its own name: rebinding `warm_tfidf` (a Series) to a
# DataFrame makes this cell fail when it is run a second time.
warm_tfidf_table = pd.DataFrame({"word": warm_tfidf.index, "TFIDF": warm_tfidf.values})
warm_tfidf_table[["word", "TFIDF"]].sort_values("TFIDF", ascending=False).head(10)

**Histogram**

In [None]:
from pandas.plotting import table

# Number of most frequent words shown in the chart and in the table
TOP_N = 100

word_counter = Counter(words)
top_list = word_counter.most_common(TOP_N)

x_labels = [val[0] for val in top_list]
y_labels = [val[1] for val in top_list]

# Bar chart of the counts, drawn on an explicit Axes so the figure does not
# depend on pyplot's "current figure" state; the words are the tick labels.
fig, ax = plt.subplots(figsize=(12, 6))
pd.Series(y_labels, index=x_labels).plot(kind='bar', ax=ax)
ax.set(title="Top {} most frequent words".format(len(top_list)),
       xlabel="Word", ylabel="Count")
plt.show()

# Same data as a table, rendered on a frameless Axes with both axes hidden
df = pd.DataFrame(top_list, columns=["Word", "Count"])
fig_table = plt.figure()
hist = fig_table.add_subplot(111, frame_on=False)
hist.xaxis.set_visible(False)
hist.yaxis.set_visible(False)

table(hist, df, loc='center')
plt.show()