In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import numpy as np
import json
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import emoji
import string


            
# Fetch the NLTK resources requested by this notebook (tokenizer model,
# WordNet data and the stop-word lists).
# NOTE(review): recent NLTK releases may additionally need 'punkt_tab' for
# word_tokenize — confirm against the installed version.
for _resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(_resource)

# Shared corpus state, filled in by the loading cell further down.
text_data = []
users_in_community = {}

# Community names; each one is used to build a "<name>_tweets.json" path.
communities = ["Chess","Fashion","AW","UCFSports_UCF Football_list"]

# English stop words plus a handful of corpus-specific filler tokens.
stop_words = set(stopwords.words('english'))
stop_words.update(('aa', 'aaaahhh', 'aah', 'aam', 'aap'))

# Token normalizers shared by the cleaning step.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:


def clean_tweets(tweet):
    """Normalize raw tweet text into a space-separated string of tokens.

    Steps, in order: lowercase; strip URLs, @mentions and '#' characters;
    drop ASCII punctuation; remove emoji; remove digit runs; tokenize; drop
    stop words; Porter-stem and then WordNet-lemmatize every remaining token.

    Args:
        tweet: Raw text (one tweet or several concatenated tweets).

    Returns:
        The processed tokens joined with single spaces.
    """
    tweet = tweet.lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)

    tweet = re.sub(r'\@\w+|\#', '', tweet)

    # Punctuation is stripped BEFORE demojize on purpose: afterwards the only
    # ':' characters left in the text are the delimiters demojize inserts.
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))

    tweet = emoji.demojize(tweet)
    # Emoji short names are not limited to [a-z_&] (they can contain digits,
    # hyphens, etc., e.g. ':1st_place_medal:'), so drop anything between a
    # pair of delimiters instead of enumerating the allowed characters.
    tweet = re.sub(r':[^:\s]+:', '', tweet)
    tweet = re.sub(r'\d+', '', tweet)
    tweet_tokens = word_tokenize(tweet)

    # Filter against the module-level `stop_words` set: it includes the
    # corpus-specific additions and avoids re-reading the NLTK stop-word
    # list once per token.
    filtered_words = [word for word in tweet_tokens if word not in stop_words]

    stemmed_words = [stemmer.stem(word) for word in filtered_words]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in stemmed_words]

    return " ".join(lemmatized_words)


In [None]:
def BOW(data):
    """Fit a CountVectorizer on `data` and return its counts and vocabulary.

    Args:
        data: The documents handed to CountVectorizer.fit_transform.

    Returns:
        A `(matrix, feature_names)` tuple: the result of `fit_transform` and
        the fitted vectorizer's `get_feature_names_out()`.
    """
    counter = CountVectorizer()
    count_matrix = counter.fit_transform(data)
    return count_matrix, counter.get_feature_names_out()



In [None]:
def tfidF(data):
    """Fit a TfidfVectorizer on `data` and return the resulting matrix.

    Args:
        data: The documents handed to TfidfVectorizer.fit_transform.

    Returns:
        The matrix produced by `fit_transform` (one row per document).
    """
    vectorizer = TfidfVectorizer()
    # Only the matrix is returned, so the feature names are not extracted.
    return vectorizer.fit_transform(data)


In [None]:
def lsa(data, n_components=20, random_state=0):
    """Project `data` onto latent components with truncated SVD (LSA).

    Args:
        data: Feature matrix to decompose (the notebook passes the tf-idf
            matrix).
        n_components: Number of SVD components to keep. Defaults to 20.
        random_state: Seed forwarded to TruncatedSVD so repeated runs give
            the same decomposition. Defaults to 0, the seed `lda` uses.

    Returns:
        A `(X, components, singular_values)` tuple: the reduced rows after
        sklearn's `Normalizer` (default settings), plus the fitted model's
        `components_` and `singular_values_`.
    """
    svd = TruncatedSVD(n_components, algorithm='arpack', random_state=random_state)
    X = svd.fit_transform(data)
    # copy=False lets the normalizer rescale the freshly created array in place.
    X = Normalizer(copy=False).fit_transform(X)
    return X, svd.components_, svd.singular_values_


In [None]:
def lda(data):
    """Fit a 20-component LatentDirichletAllocation model on `data`.

    Args:
        data: Feature matrix handed to LatentDirichletAllocation.fit_transform.

    Returns:
        A `(X, components)` tuple: the transformed rows after sklearn's
        `Normalizer` (default settings) and the fitted model's `components_`.
    """
    topic_model = LatentDirichletAllocation(random_state=0, n_components=20)
    doc_topics = Normalizer(copy=False).fit_transform(topic_model.fit_transform(data))
    return doc_topics, topic_model.components_

In [None]:
# Per-community feature tables, keyed community -> user -> feature row.
BOW_features, tfidf_features = dict(), dict()
LSA_Tfeatures, LDA_Tfeatures = dict(), dict()

# Vocabulary placeholder.
unique_words = list()

# LSA model outputs (component matrix and singular values).
LSA_Tword_topic, LSA_singular = list(), list()

# LDA model output (component matrix).
LDA_Tword_topic = list()


In [None]:

def tbfeature():
    """Compute bag-of-words and tf-idf rows and file them per community/user.

    Vectorizes the module-level `text_data`, then walks `communities` and
    `users_in_community` in order, storing row `cnt` of each matrix under
    `BOW_features[community][user]` and `tfidf_features[community][user]`.
    This relies on `text_data` having been filled in that same
    community-then-user order.

    Side effects:
        Rebuilds `BOW_features` / `tfidf_features` for every community and
        stores the bag-of-words vocabulary in the module-level `unique_words`.

    Returns:
        The full tf-idf matrix for `text_data`.
    """
    # Without this declaration the vocabulary was bound to a local variable
    # and the module-level `unique_words` stayed empty.
    global unique_words

    BOW_list, unique_words = BOW(text_data)
    tfidf_list = tfidF(text_data)

    cnt = 0
    for community in communities:
        BOW_features[community] = {}
        tfidf_features[community] = {}
        for user in users_in_community[community]:
            BOW_features[community][user] = BOW_list[cnt]
            tfidf_features[community][user] = tfidf_list[cnt]
            cnt += 1
    return tfidf_list
    
        



In [None]:
def ldfeature(tfidf_list):
    """Compute LSA and LDA rows and file them per community/user.

    Runs `lsa` and `lda` on `tfidf_list`, then walks `communities` and
    `users_in_community` in order, storing row `cnt` of each result under
    `LSA_Tfeatures[community][user]` and `LDA_Tfeatures[community][user]`.
    Row order is expected to match that community-then-user walk.

    Args:
        tfidf_list: Matrix with one row per user document.

    Side effects:
        Rebuilds `LSA_Tfeatures` / `LDA_Tfeatures` for every community and
        stores the model outputs in the module-level `LSA_Tword_topic`,
        `LSA_singular` and `LDA_Tword_topic`.
    """
    # Without this declaration the model outputs were bound to locals and the
    # module-level placeholders were never populated.
    global LSA_Tword_topic, LSA_singular, LDA_Tword_topic

    LSA_list, LSA_Tword_topic, LSA_singular = lsa(tfidf_list)
    LDA_list, LDA_Tword_topic = lda(tfidf_list)

    cnt = 0
    for community in communities:
        LSA_Tfeatures[community] = {}
        LDA_Tfeatures[community] = {}
        for user in users_in_community[community]:
            LSA_Tfeatures[community][user] = LSA_list[cnt]
            LDA_Tfeatures[community][user] = LDA_list[cnt]
            # Advance to the next document row; previously the counter never
            # moved, so every user received row 0.
            cnt += 1
            
        

        

In [None]:

# Start from an empty corpus so that re-running this cell does not append a
# second copy of every document (which would change the fitted vectorizers).
text_data.clear()

for community in communities:

    # Read explicitly as UTF-8 instead of relying on the platform default.
    with open("Twitter_Community_Data/"+community+"_tweets.json", 'r', encoding="utf-8") as f:
        file = json.load(f)
    users_in_community[community] = []
    for user in file:
        users_in_community[community].append(user)
        text = ""

        # One document per user: the "text" of the first entry of every tweet
        # record, concatenated with spaces.
        for tweet in file[user]:
            text += " "+file[user][tweet][0]["text"]

        text = clean_tweets(text)
        text_data.append(text)
        print(text)


# Build the feature tables; rows follow the community/user order used above.
tfidf_list = tbfeature()
ldfeature(tfidf_list)


In [47]:
# import seaborn as sns
# import matplotlib.pylab as plt


# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
# pd.set_option('display.expand_frame_repr', False)
# pd.set_option('display.max_colwidth', None)

# Preview the four feature tables for the 'Chess' community, one row per user.

# Bag-of-words rows: converted with .toarray() and squeezed before stacking.
df = pd.DataFrame(
    [np.squeeze(vec.toarray()) for vec in BOW_features['Chess'].values()],
    index=users_in_community['Chess'],
)
print(df)

# tf-idf rows: same conversion as above.
df = pd.DataFrame(
    [np.squeeze(vec.toarray()) for vec in tfidf_features['Chess'].values()],
    index=users_in_community['Chess'],
)
print(df)

# LSA rows are used as stored.
df = pd.DataFrame(
    list(LSA_Tfeatures['Chess'].values()),
    index=users_in_community['Chess'],
)
print(df)

# LDA rows are used as stored.
df = pd.DataFrame(
    list(LDA_Tfeatures['Chess'].values()),
    index=users_in_community['Chess'],
)
print(df)


            

            0      1      2      3      4      5      6      7      8      9      10     11     12     13     14     15     16     17     18     19     20     21     22     23     24     25     26     27     28     29     30     31     32     33     34     35     36     37     38     39     40     41     42     43     44     45     46     47     48     49     50     51     52     53     54     55     56     57     58     59     60     61     62     63     64     65     66     67     68     69     70     71     72     73     74     75     76     77     78     79     80     81     82     83     84     85     86     87     88     89     90     91     92     93     94     95     96     97     98     99     100    101    102    103    104    105    106    107    108    109    110    111    112    113    114    115    116    117    118    119    120    121    122    123    124    125    126    127    128    129    130    131    132    133    134    135    136    137    138    139    140    1