In [None]:
import csv
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import os
import emoji
import pickle

In [None]:
# Load all the data in a Dataframe
def load_df(rootdir):
    """Read every CSV file under ``rootdir`` into a single DataFrame.

    The directory tree is walked recursively. Rows with fewer than 12 fields
    are skipped. A row whose 12th field is the literal ``'tweet_text'`` is
    treated as a header: it supplies the column names and is not stored.

    Parameters
    ----------
    rootdir : str
        Directory to walk.

    Returns
    -------
    pandas.DataFrame
        One row per data row found. The column names come from the last
        header row seen; if no header row was found, pandas' default integer
        column labels are used instead.
    """
    tweets = []
    cols = []
    for subdir, dirs, files in os.walk(rootdir):
        for file in tqdm(files):
            filename = os.path.join(subdir, file)
            # newline='' lets the csv module handle line breaks embedded in
            # quoted fields itself, as the csv documentation requires.
            with open(filename, 'r', encoding="utf8", newline='') as f:
                for row in csv.reader(f):
                    if len(row) < 12:
                        continue
                    if row[11] == 'tweet_text':
                        cols = row
                        continue
                    tweets.append(row)
    # An empty column list combined with non-empty data would make pandas
    # raise, so fall back to default labels when no header was found.
    return pd.DataFrame(tweets, columns=cols or None)

# Load every data row found under 'deleted_accounts' into one DataFrame.
tw = load_df('deleted_accounts')

In [None]:
# Show full cell contents instead of truncating long text. None is the
# supported way to remove the limit; the old -1 sentinel is deprecated and
# rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
# Rows whose `name` column equals this account name.
tw[tw.name=='Divya Spandana/Ramya']

In [None]:
"""[days since account creation, number of followers, number of friends (same as followings), number of favorites,
number of tweets, friend-to-follow ratio, name length in chars, bio length in chars, screen name length in chars,
screen name length in words, bio length words, avg. number of tweets per hour]"""

In [None]:
# Extract per-account profile features and the average number of tweets per
# active one-hour window.
def _profile_from_row(row, ctime):
    """Return the profile-level features read from one CSV data row."""
    return {
        'followers': row[1],
        'time_since_creation': (ctime - pd.to_datetime(row[4], utc=True)).total_seconds(),
        'friends': row[2],
        'favourites': row[3],
        # +1 keeps the ratio defined for accounts with zero followers.
        'friend_to_follow': float(row[2]) / (float(row[1]) + 1),
        'name_len': len(row[6]),
        'bio_len': len(row[8]),
        'scr_len': len(row[7]),
        'scr_len_words': len(row[7].strip(' ').split(' ')),
        'bio_len_words': len(row[8].strip(' ').split(' ')),
    }


def extract_profile_features(root_dir):
    """Build a feature dict for every CSV file found under ``root_dir``.

    Each file is read as: one header row (skipped), then one row per tweet
    with the columns::

        ['account_id', 'followers', 'friends', 'favourites', 'account_creation',
         'is_verified', 'name', 'screen_name', 'description', 'tweet_id',
         'tweet_creation', 'tweet_text', 'is_RT']

    Parameters
    ----------
    root_dir : str
        Directory that is walked recursively for CSV files.

    Returns
    -------
    dict
        Maps the account id (first column of the first data row that parses;
        ``0`` if no row parses) to a dict with the keys ``followers``,
        ``time_since_creation`` (seconds), ``friends``, ``favourites``,
        ``friend_to_follow``, ``name_len``, ``bio_len``, ``scr_len``,
        ``scr_len_words``, ``bio_len_words``, ``no_of_tweets`` and
        ``avg_tweets``. Profile keys are absent if no row could be parsed.

    Notes
    -----
    Rows that cannot be parsed are printed together with the file name and
    otherwise ignored. Timestamps without a timezone are interpreted as UTC.
    """
    features = {}
    ctime = pd.Timestamp.now(tz='UTC')
    for subdir, dirs, files in os.walk(root_dir):
        for file in tqdm(files):
            filename = os.path.join(subdir, file)
            userid = 0
            usr_features = {}
            has_profile = False
            rows_seen = 0
            window_counts = []   # number of tweets in each finished window
            window_start = None  # age (seconds) of the tweet that opened the current window
            window_count = 0
            with open(filename, 'r', encoding="utf8", newline='') as f:
                for row in csv.reader(f):
                    rows_seen += 1
                    if rows_seen == 1:
                        # header row
                        continue
                    try:
                        otime = pd.to_datetime(row[10], utc=True)
                        if pd.isna(otime):
                            raise ValueError('missing tweet_creation timestamp')
                        tweet_age = (ctime - otime).total_seconds()
                        if window_start is None:
                            window_start, window_count = tweet_age, 1
                        elif abs(tweet_age - window_start) <= 3600:
                            window_count += 1
                        else:
                            # This tweet is more than an hour away from the
                            # window's first tweet: close it and open a new one.
                            window_counts.append(window_count)
                            window_start, window_count = tweet_age, 1
                        if not has_profile:
                            userid = row[0]
                            usr_features.update(_profile_from_row(row, ctime))
                            has_profile = True
                    except Exception:
                        # Best effort: report the malformed row and keep going.
                        print(row)
                        print(file)

            # The window that was still open when the file ended counts too.
            if window_start is not None:
                window_counts.append(window_count)

            usr_features['no_of_tweets'] = max(rows_seen - 1, 0)
            if window_counts:
                usr_features['avg_tweets'] = sum(window_counts) / len(window_counts)
            else:
                usr_features['avg_tweets'] = 0.0
            features[userid] = usr_features
    return features
    


In [None]:
# Build the feature dict for the CSV files under this directory.
profile_features = extract_profile_features('/Deleted_Accounts/deleted_accounts/')

In [None]:
#extract syntactic features
#  find emojis [i for i in x.split() if unicode(i, "utf-8") in emoji.UNICODE_EMOJI]

In [None]:
# Collect the second-to-last field of every CSV row under `rootdir` that
# contains ' hack ' or ' hacked ' but not 'EVM'.
rootdir = '/Deleted_Accounts/deleted_accounts/'
features = []
ctime = pd.Timestamp.now(tz='UTC')
for subdir, dirs, files in os.walk(rootdir):
    for file in tqdm(files):
        filename = os.path.join(subdir, file)
        # Same encoding as load_df; newline='' is what the csv module expects.
        with open(filename, 'r', encoding="utf8", newline='') as f:
            for row in csv.reader(f):
                # Rows too short to have a second-to-last field are skipped
                # explicitly rather than through a blanket except.
                if len(row) < 2:
                    continue
                text = row[-2]
                if (' hack ' in text or ' hacked ' in text) and 'EVM' not in text:
                    features.append(text)

In [None]:
# Display the contents of `features`.
features

In [None]:
# Gather the 12th field of every sufficiently long CSV row under
# 'normal_accounts' (header rows included) and count the files visited.
rootdir = 'normal_accounts'
count = 0
tweets = []

for dirpath, _subdirs, filenames in os.walk(rootdir):
    for fname in filenames:
        count += 1
        filename = dirpath + os.sep + fname
        with open(filename, 'r', encoding="utf8") as f:
            print(filename)
            # rows with fewer than 12 fields carry no tweet text
            tweets.extend(row[11] for row in csv.reader(f) if len(row) >= 12)
                

In [None]:
# Wrap the collected tweet texts in a one-column DataFrame and pickle it.
# NOTE(review): `tweets` is filled from the 'normal_accounts' directory, yet
# the file is named 'deleted_tweets.pickle' — confirm the name is intended.
df = pd.DataFrame(tweets, columns=['tweet_text'])  
df.to_pickle('deleted_tweets.pickle')

In [None]:
from nltk.corpus import words
from nltk.corpus import wordnet 

In [None]:
# Spot check: is this word present in either the NLTK `words` corpus or WordNet?
wrd = 'Modi'
wrd in words.words() or wrd in wordnet.words()

In [None]:
import re
def remove_links(tweet):
    """Return ``tweet`` with links removed.

    Removes anything starting with ``http`` up to the next whitespace,
    ``bit.ly/...`` short links, and every literal ``[link]`` placeholder.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The text without links; surrounding whitespace is left untouched.
    """
    tweet = re.sub(r'http\S+', '', tweet)
    # The dot is escaped so only the literal domain "bit.ly" matches.
    tweet = re.sub(r'bit\.ly/\S+', '', tweet)
    # str.strip('[link]') would strip any of the characters [ l i n k ] from
    # both ends (mangling words such as "kill"); remove the placeholder itself.
    tweet = tweet.replace('[link]', '')
    return tweet

def remove_users(tweet):
    """Return ``tweet`` with user mentions removed.

    Removes retweet prefixes of the form ``RT @user`` and any remaining
    ``@user`` mention. A user name is a run of letters followed by at least
    one more letter, digit, ``-`` or ``_``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The text without mentions; other text (including a bare "RT" that is
        not followed by a mention) is left untouched.
    """
    # The '@' is required: without it the pattern deletes the ordinary word
    # following "RT" and never matches an actual "RT @user" prefix.
    tweet = re.sub(r'(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)
    tweet = re.sub(r'(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)
    return tweet

def clean_tweet(tweet, bigrams=False):
    """Normalise a tweet into a space-separated string of stemmed tokens.

    Steps: remove mentions and links, lowercase, replace runs of characters
    from ``my_punctuation`` with a space, delete digits, split on whitespace,
    drop tokens found in ``my_stopwords``, and stem every token that does not
    contain ``#`` with ``word_rooter``.

    Relies on the module-level names ``my_punctuation``, ``my_stopwords`` and
    ``word_rooter`` being defined before the first call.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    bigrams : bool, optional
        When true, ``first_second`` bigrams of adjacent tokens are appended
        after the single tokens.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.lower()
    tweet = re.sub('[' + my_punctuation + ']+', ' ', tweet)
    tweet = re.sub(r'[0-9]+', '', tweet)
    # split() without arguments collapses whitespace runs and never yields
    # empty tokens, so nothing empty reaches the stemmer or the bigrams.
    tokens = [word for word in tweet.split() if word not in my_stopwords]
    tokens = [word if '#' in word else word_rooter(word) for word in tokens]
    if bigrams:
        pairs = [first + '_' + second for first, second in zip(tokens, tokens[1:])]
        tokens = tokens + pairs
    return ' '.join(tokens)

def find_hashtags(tweet):
    """Return every hashtag found in ``tweet``, in order of appearance.

    A hashtag is ``#`` followed by one or more letters and then at least one
    more letter, digit, ``-`` or ``_``.
    """
    hashtag_pattern = re.compile('(#[A-Za-z]+[A-Za-z0-9-_]+)')
    return hashtag_pattern.findall(tweet)


In [None]:
# Reload the DataFrame stored in 'deleted_tweets.pickle'.
df = pd.read_pickle('deleted_tweets.pickle')

In [None]:
# Imported here so this cell does not depend on a later cell having been run.
import nltk

# Stop words: NLTK's English list extended with the entries of stopwords.txt
# (one entry per line), preserving order and skipping duplicates.
my_stopwords = nltk.corpus.stopwords.words('english')
with open('stopwords.txt', 'r') as stopword_file:
    swords = stopword_file.read().split('\n')
known_stopwords = set(my_stopwords)  # set lookup instead of scanning the list each time
for word in swords:
    if word not in known_stopwords:
        my_stopwords.append(word)
        known_stopwords.add(word)

# Stemmer and punctuation set used by clean_tweet.
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

# Clean every tweet and persist the cleaned column. The column written is the
# one created on the previous line (reading 'clean_text' raised KeyError).
df['clean_tweet'] = df.tweet_text.apply(clean_tweet)
df['clean_tweet'].to_pickle('suspended_clean_tweets.pickle')

In [None]:
# Keep the tweets (rows 1..999) whose share of tokens unknown to the NLTK
# English vocabularies is at most `threshold`, then write how many were kept.
# NOTE(review): tweets are kept when they are mostly made of known English
# words, yet the list is called `hinglish` — confirm the comparison direction.
hinglish = []
threshold = 0.5
total = 0
# Build the vocabulary once; rebuilding both word lists for every token made
# each lookup a full corpus scan.
english_vocab = set(words.words()) | set(wordnet.words())
# Positional slice: equivalent to labels 1..999 on the default index, and does
# not fail when the frame holds fewer than 1000 rows.
for raw_tweet in tqdm(df['tweet_text'].iloc[1:1000]):
    # Header rows are stored as the raw string 'tweet_text'; the cleaned text
    # never equals it because '_' is replaced as punctuation.
    if raw_tweet == 'tweet_text':
        continue
    total += 1
    clean = clean_tweet(raw_tweet)
    # Drop the empty strings re.split yields at the edges so they are not
    # counted as unknown words.
    tokens = [tok for tok in re.split(r'[^A-Za-z0-9]+', clean) if tok]
    if not tokens:
        continue
    unknown = sum(1 for tok in tokens if tok not in english_vocab)
    if unknown / len(tokens) <= threshold:
        hinglish.append(raw_tweet)
print(str(len(hinglish))+"/"+str(total))

with open('hinglish.txt','w+') as fil:
    fil.write(str(len(hinglish)))

In [None]:
# Display the tweets kept by the filter loop.
hinglish

In [None]:
import nltk

# Fetch every NLTK corpus this notebook reads: WordNet and the `words` list
# for the vocabulary checks, and `stopwords` for tweet cleaning.
for corpus_name in ('wordnet', 'words', 'stopwords'):
    nltk.download(corpus_name)

In [None]:
from collections import defaultdict
# Load the hashtag table from disk. Read-only binary mode is enough ('rb+'
# additionally demanded write permission on the file).
# NOTE: pickle.load can execute arbitrary code — only load files you created.
with open('top_hashtags_deleted.pkl', 'rb') as fil:
    hash_array = pickle.load(fil)

In [None]:
def find_hashtags(tweet):
    """Return every hashtag found in ``tweet``, in order of appearance.

    A hashtag is ``#`` followed by one or more letters and then at least one
    more letter, digit, ``-`` or ``_``.
    """
    hashtag_pattern = re.compile('(#[A-Za-z]+[A-Za-z0-9-_]+)')
    return hashtag_pattern.findall(tweet)

# Count, across `tweets`, how often each hashtag that is a key of
# `hash_array` occurs; `total` is the number of tweets examined
# (header entries equal to 'tweet_text' are not counted).
freq_dict = defaultdict(int)
total = 0
for tweet in tweets:
    if tweet != 'tweet_text':
        total += 1
        hashes = find_hashtags(tweet)
        for tag in hashes:
            if tag in hash_array:
                freq_dict[tag] += 1

In [None]:
# Print each hashtag currently present in freq_dict, one per line.
for hashtag in freq_dict:
    print(hashtag)

In [None]:
# Print hashtags by descending count with their share of all counted hashtags.
# The normaliser is computed here so the cell does not rely on a variable
# that is only assigned in a later cell.
counted_total = sum(freq_dict.values())
for key in sorted(freq_dict, key=freq_dict.get, reverse=True):
    share = float(freq_dict[key]) / counted_total if counted_total else 0.0
    print(key, freq_dict[key], round(share, 4))

In [None]:
# Display the table loaded from 'top_hashtags_deleted.pkl'.
hash_array

In [None]:
import matplotlib.pyplot as plt

# Draw both series on one axes. They are semi-transparent and labelled so
# overlapping bars stay distinguishable.
fig, ax = plt.subplots()
ax.bar(list(hash_array.keys()), list(hash_array.values()), alpha=0.6, label='Deleted')
# Guard against an empty tweet list, which would otherwise divide by zero.
normal_share = [val / total if total else 0.0 for val in freq_dict.values()]
ax.bar(list(freq_dict.keys()), normal_share, alpha=0.6, label='Normal')
ax.set_xlabel('Hashtag')
ax.set_ylabel('Frequency')
ax.tick_params(axis='x', rotation=90)
ax.legend()
plt.show()

In [None]:
# Display the per-hashtag counts.
freq_dict

In [None]:
# Display the table loaded from 'top_hashtags_deleted.pkl'.
hash_array

In [None]:
# Print, per hashtag, its hash_array value next to its freq_dict count scaled
# by the sum of all hash_array values. The normaliser is computed here so the
# cell does not rely on a variable that is only assigned in a later cell.
# NOTE(review): the comparison table built further down scales freq_dict by
# the sum of freq_dict instead — confirm which normaliser is intended here.
hash_total = sum(hash_array.values())
for key in hash_array:
    scaled = freq_dict[key] / hash_total if hash_total else 0.0
    print(key,"  ",round(hash_array[key],4)," "," ",round(scaled,4))

In [None]:
# Build the comparison table: each hashtag's share within hash_array, its
# share within freq_dict, and the gap between the two in percentage points.
# NOTE: this rebinds `df`, which earlier cells use for the tweet DataFrame.
dif1 = sum(hash_array.values())
dif2 = sum(freq_dict.values())
comparison_rows = []
for key in hash_array:
    # Guard both normalisers so an empty table or no matches yields zeros
    # instead of a ZeroDivisionError.
    deleted_share = hash_array[key] / dif1 if dif1 else 0.0
    normal_share = freq_dict[key] / dif2 if dif2 else 0.0
    comparison_rows.append([key, deleted_share, normal_share, (deleted_share - normal_share) * 100])
# Create the frame once from all rows rather than growing it row by row.
df = pd.DataFrame(comparison_rows, columns=['Hashtag','Deleted','Normal','Difference'])

In [None]:
# Show the table ordered by the 'Difference' column, largest first.
df.sort_values(by='Difference',ascending=False)

In [None]:
# Display `total` as last assigned (by the hashtag-counting loop when the cells run top to bottom).
total