In [1]:
from IPython.display import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
sns.set(rc={'figure.figsize':(6,8)}) 

import warnings
warnings.simplefilter("ignore")

%matplotlib inline 

from sklearn.preprocessing import LabelEncoder 

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn import metrics

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.metrics import SCORERS
import re

import nltk 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.probability import FreqDist

In [2]:
def cleanTxt(txt):
    """Normalise a tweet for text analysis.

    Steps, in order: coerce to ``str``, strip punctuation, drop the retweet
    marker ``RT``, lower-case, remove underscores, and cut the text at the
    first hyperlink.

    Parameters
    ----------
    txt : object
        Raw tweet text. Non-string values are converted with ``str``.

    Returns
    -------
    str
        The cleaned, lower-cased text. Surrounding whitespace is not stripped.
    """
    if not isinstance(txt, str):
        txt = str(txt)
    txt = re.sub(r'[^\w\s]', '', txt)  # drop punctuation (keeps word characters and whitespace)
    # The retweet marker has to go *before* lower-casing: the pattern is
    # upper-case, so it can never match lower-cased text. The \b anchor keeps
    # words that merely end in "RT" (e.g. "SMART") intact.
    txt = re.sub(r'\bRT\s+', '', txt)
    txt = txt.lower()
    txt = txt.replace('_', '')  # \w treats "_" as a word character, so remove it explicitly
    # Punctuation is already gone, so a link survives as e.g. "httpstcoabc";
    # everything from the first "https" onwards is discarded.
    link_pos = txt.find('https')
    if link_pos != -1:
        txt = txt[:link_pos]
    return txt

In [4]:
def stopWords(txt, extra_stopwords=('\r\n\r\n', 'u0001f449', 'dataprivacy', 'data', 'privacy')):
    """Remove stop words from ``txt`` and lemmatize what is left.

    Parameters
    ----------
    txt : str
        Text whose words are separated by single spaces.
    extra_stopwords : iterable of str, optional
        Tokens dropped *after* lemmatization, in addition to empty tokens.
        The defaults are the values this notebook filters out: a line-break
        run, the code of the "backhand index pointing right" emoji, and the
        search terms themselves.

    Returns
    -------
    str
        The remaining lemmas joined by single spaces.
    """
    # Build the stop-word set once per call; membership tests on a set are
    # O(1), whereas calling stopwords.words() per word rebuilds the list each time.
    english_stopwords = set(stopwords.words('english'))
    # Use a local lemmatizer instead of relying on a notebook-level `wnet`
    # variable that no cell in this notebook defines.
    lemmatizer = WordNetLemmatizer()
    dropped = set(extra_stopwords)

    kept = []
    for word in txt.split(" "):
        if word in english_stopwords:
            continue
        lemma = lemmatizer.lemmatize(word)
        if lemma != '' and lemma not in dropped:
            kept.append(lemma)
    return " ".join(kept)


In [5]:
def evaluate_k_kmeans_inertia(k, vec):
    """Fit k-means with ``k`` clusters on ``vec`` and return the model's inertia.

    A progress line is printed before fitting. The model is created with
    ``random_state=42``.
    """
    print(f"running Kmeans with k={k}")
    fitted_model = KMeans(n_clusters=k, random_state=42).fit(vec)
    return fitted_model.inertia_

In [6]:
!pip install countrygroups



In [7]:
!pip install pycountry



In [113]:
from  countrygroups import EUROPEAN_UNION
import pycountry

def findCountry(txt):
    """Map a free-text profile location to a country (or region) label.

    Resolution order:

    1. ``'EU'``, ``'Europe'`` and ``'European Union'`` -> ``'European Union'``.
    2. Anything that is not a string (NaN, None, ...) -> ``'Unknown'``.
    3. Text containing "global" or "worldwide" -> ``'Global'``.
    4. Fuzzy country search on the whole text (commas removed).
    5. US state lookup on the whole text -> ``'United States'``.
    6. Word by word: subdivision lookup, then fuzzy country search. Only in
       this last step are EU member states collapsed to ``'European Union'``.
    7. ``'Unknown'`` when nothing matched.

    Parameters
    ----------
    txt : str or missing value
        Raw location string.

    Returns
    -------
    str
        Country name, ``'European Union'``, ``'Global'`` or ``'Unknown'``.
    """
    if txt == 'EU' or txt == 'Europe' or txt == 'European Union':
        return 'European Union'
    # An identity test against np.nan misses other NaN objects and None;
    # anything that is not text cannot be resolved.
    if not isinstance(txt, str):
        return 'Unknown'
    txt = txt.replace(',', '')  # removes commas only; other punctuation is kept
    lowered = txt.lower()
    if 'global' in lowered or 'worldwide' in lowered:  # over 100 entries with global or worldwide as location
        return 'Global'
    if not txt.strip():
        # Nothing left to look up.
        return 'Unknown'

    # Lookups are best-effort. `except Exception` (not a bare `except`) keeps
    # that behaviour while still letting KeyboardInterrupt stop a long run.
    try:
        return pycountry.countries.search_fuzzy(txt)[0].name
    except Exception:
        pass

    try:
        # NOTE(review): `us` is not imported by any visible cell; without the
        # import this raises NameError and the step is skipped. The lookup is
        # expected to return None (not raise) when nothing matches, so the
        # result must be checked before answering "United States".
        state = us.states.lookup(txt)
    except Exception:
        state = None
    if state is not None:
        return "United States"

    # split() without an argument drops empty tokens, which carry no location
    # information and should never be sent to the fuzzy search.
    for word in txt.split():
        try:
            return pycountry.subdivisions.lookup(word).country.name
        except Exception:
            pass
        try:
            name = pycountry.countries.search_fuzzy(word)[0].name
        except Exception:
            continue
        if name in EUROPEAN_UNION.names:
            return "European Union"
        return name
    return 'Unknown'

In [None]:
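# NOTE: earlier geopy/Nominatim-based version of findCountry, kept commented out for reference only.
# from  countrygroups import EUROPEAN_UNION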
# from geopy.geocoders import Nominatim
# from geopy.extra.rate_limiter import RateLimiter
# geolocator = Nominatim(user_agent = "geoapiExercises")
# geocode = RateLimiter(geolocator.geocode, max_retries=2, swallow_exceptions=True, return_value_on_exception=None)
# def findCountry(txt):
#     if txt is np.nan:
#         return 'Unknown'
#     txt = re.sub(r'[^\w\s]','', txt) #remove everything except words and space
#     if 'global' in txt.lower() or 'worldwide' in txt.lower(): # over 100 entries with global or worldwide as location
#         return 'Global'
#     txt_lst = txt.split(" ")
#     for word in txt_lst:  # a few entries that return unknown when a number is present (regex doesn't remove it for some reason)
#         if any(ch.isdigit() for ch in word):
#             txt_lst.remove(word)
#     txt = " ".join(txt_lst)
#     try:
#         x = geolocator.geocode(txt, )
#         if x is None:
#             return 'Unknown'
#         loc = x.address.split(',')[-1]
#         if loc[0] == " ":
#             loc = loc[1:]
#         if loc in EUROPEAN_UNION.names:
#              return "European Union"
#         return loc
#     except:
#         return 'Unknown'

In [9]:
def splitData(data, len_of_frame):
    """Split ``data`` into consecutive row chunks of about ``len_of_frame`` rows.

    The number of chunks is ``round(len(data) / len_of_frame)``, but never
    less than one, so a frame shorter than half a chunk is returned whole
    instead of being dropped. Every chunk except the last has exactly
    ``len_of_frame`` rows; the last one takes all remaining rows, so it may
    be shorter or longer than the others.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split; rows are sliced positionally.
    len_of_frame : int
        Target number of rows per chunk.

    Returns
    -------
    list of pandas.DataFrame
        The chunks in row order; concatenating them reproduces ``data``.

    Notes
    -----
    The number of chunks is printed (as a float) on every call.
    """
    n_rows = len(data.index)
    # Rounding can give 0 for small inputs, which used to return an empty list.
    n_chunks = max(round(n_rows / len_of_frame, 0), 1.0)
    print(n_chunks)
    n_chunks = int(n_chunks)

    chunks = []
    for i in range(n_chunks):
        start = int(i * len_of_frame)
        if i == n_chunks - 1:
            # The last chunk absorbs whatever is left over.
            chunks.append(data[start:])
        else:
            chunks.append(data[start:int(len_of_frame * (i + 1))])
    return chunks

In [10]:
def cluster_summary(ci, df, categorical_columns=None):
    """Summarise one cluster: mode of categorical columns, mean of numeric ones.

    Parameters
    ----------
    ci : int
        Cluster id to summarise; rows are selected with ``df.cluster_id == ci``.
    df : pandas.DataFrame
        Frame with a ``cluster_id`` column.
    categorical_columns : sequence of str, optional
        Columns summarised by their mode. Defaults to the columns of the
        notebook-level ``categorical_data`` frame.

    Returns
    -------
    dict
        Column name -> first mode (categorical columns) or mean (numeric
        columns). A numeric mean overwrites a mode stored under the same name.
    """
    if categorical_columns is None:
        categorical_columns = categorical_data.columns
    members = df[df.cluster_id == ci]
    summary = members[list(categorical_columns)].mode().to_dict(orient="records")[0]
    # numeric_only=True: averaging string columns raises in recent pandas and
    # has no meaning for categorical values anyway.
    summary.update(members.mean(numeric_only=True).to_dict())
    return summary

In [11]:
# A helper that puts the cluster summaries side by side in one data frame, so they are easier to compare

def cluster_comparison(*cluster_ids, df=None):
    """Build a side-by-side comparison of several clusters.

    Parameters
    ----------
    *cluster_ids : int
        Ids of the clusters to compare.
    df : pandas.DataFrame, optional
        Frame with a ``cluster_id`` column. Defaults to the notebook-level
        ``df2`` so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One column per cluster id, one row per summarised feature.
    """
    if df is None:
        df = df2
    summaries = []
    for cluster_id in cluster_ids:
        summary = cluster_summary(cluster_id, df)
        # Label the column with the requested id itself rather than with the
        # float mean of the cluster_id column.
        summary["cluster_id"] = cluster_id
        summaries.append(summary)
    return pd.DataFrame(summaries).set_index("cluster_id").T

In [12]:
def kmeans_cluster(df, n_clusters=2):
    """Return a copy of ``df`` with a ``Cluster`` column of k-means labels.

    The model is fitted on every column of ``df`` with ``random_state=42``;
    the input frame itself is left unchanged.
    """
    labels = KMeans(random_state=42, n_clusters=n_clusters).fit_predict(df)
    return df.assign(Cluster=labels)

In [13]:
def graph_summary_clustering(results):
    """Return the per-cluster mean of every feature.

    Parameters
    ----------
    results : pandas.DataFrame
        Feature frame with a ``Cluster`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Cluster``, one column per feature, columns sorted by name.
    """
    # The cluster sizes used to be computed, merged in and dropped again
    # without being used, and that merge broke on a feature named 'Count'.
    # The group means alone give the same table.
    cluster_means = results.groupby('Cluster').mean()
    return cluster_means[sorted(cluster_means.columns)]

# Data Privacy clustering on other columns: preprocessing

In [21]:
data = pd.read_csv('data_privacy_tweets.csv')

In [22]:
data = data.drop_duplicates('text')
data.head()

Unnamed: 0,user_id,status_id,created_at,screen_name,text,source,display_text_width,reply_to_status_id,reply_to_user_id,reply_to_screen_name,...,statuses_count,favourites_count,account_created_at,verified,profile_url,profile_expanded_url,account_lang,profile_banner_url,profile_background_url,profile_image_url
0,x1166589525386301440,x1404485195676459008,2021-06-14 17:05:39,privacy_issues,The @EFF finds that the proposed modifications...,TweetDeck,248,,,,...,1036,2177,2019-08-28 05:53:33,False,https://t.co/HFMGW8U18M,http://bit.ly/privacyissuessignup,,https://pbs.twimg.com/profile_banners/11665895...,,http://pbs.twimg.com/profile_images/1166589644...
1,x1166589525386301440,x1403010409473449986,2021-06-10 15:25:22,privacy_issues,The European Parliament passes the new #EU #CO...,TweetDeck,223,,,,...,1036,2177,2019-08-28 05:53:33,False,https://t.co/HFMGW8U18M,http://bit.ly/privacyissuessignup,,https://pbs.twimg.com/profile_banners/11665895...,,http://pbs.twimg.com/profile_images/1166589644...
2,x1166589525386301440,x1404185087093182465,2021-06-13 21:13:07,privacy_issues,To gain better control over the increasingly p...,TweetDeck,240,,,,...,1036,2177,2019-08-28 05:53:33,False,https://t.co/HFMGW8U18M,http://bit.ly/privacyissuessignup,,https://pbs.twimg.com/profile_banners/11665895...,,http://pbs.twimg.com/profile_images/1166589644...
3,x1166589525386301440,x1401833811940306947,2021-06-07 09:30:00,privacy_issues,What if the consumers are on the driving seat ...,TweetDeck,255,,,,...,1036,2177,2019-08-28 05:53:33,False,https://t.co/HFMGW8U18M,http://bit.ly/privacyissuessignup,,https://pbs.twimg.com/profile_banners/11665895...,,http://pbs.twimg.com/profile_images/1166589644...
4,x1166589525386301440,x1401555697163280387,2021-06-06 15:04:52,privacy_issues,"""Vera Jourová says new rules are needed to lim...",TweetDeck,230,,,,...,1036,2177,2019-08-28 05:53:33,False,https://t.co/HFMGW8U18M,http://bit.ly/privacyissuessignup,,https://pbs.twimg.com/profile_banners/11665895...,,http://pbs.twimg.com/profile_images/1166589644...


## Location to country

In [23]:
# Split once: splitData prints the number of chunks on every call, and
# repeating the call six times repeats the whole split each time.
# Unpacking into six names fails loudly if the data does not split into exactly
# six chunks, whereas indexing [0]..[5] would silently ignore any further chunk.
# .copy() gives each chunk its own frame, so columns added later are not
# written into a slice of `data`.
df1, df2, df3, df4, df5, df6 = (chunk.copy() for chunk in splitData(data, 1700))

6.0
6.0
6.0
6.0
6.0
6.0


In [118]:
%%time 
df1['country'] = df1['location'].apply(findCountry)

CPU times: user 50.3 s, sys: 89.1 ms, total: 50.3 s
Wall time: 50.5 s


In [119]:
df1['location']

0                   EU
1                   EU
2                   EU
3                   EU
4                   EU
             ...      
1770    Carrollton, TX
1772               NaN
1773               NaN
1774               NaN
1775               NaN
Name: location, Length: 1700, dtype: object

In [120]:
df1['country']

0       European Union
1       European Union
2       European Union
3       European Union
4       European Union
             ...      
1770     United States
1772           Unknown
1773           Unknown
1774           Unknown
1775           Unknown
Name: country, Length: 1700, dtype: object

In [None]:
%%time 
df2['country'] = df2['location'].apply(findCountry)

In [None]:
%%time
df4['country'] = df4['location'].apply(findCountry)

In [None]:
%%time
df5['country'] = df5['location'].apply(findCountry)

In [None]:
%%time
df6['country'] = df6['location'].apply(findCountry)

In [None]:
chunks = [df1, df2, df3, df4, df5, df6]
# Every chunk needs a 'country' column before concatenation; a chunk whose
# lookup cell was not run would contribute NaN countries. The lookup cells in
# this notebook cover df1, df2, df4, df5 and df6, but not df3.
for chunk in chunks:
    if 'country' not in chunk.columns:
        chunk['country'] = chunk['location'].apply(findCountry)
data = pd.concat(chunks)
data[['country']].head()

In [None]:
# Parse the post timestamp, then derive the day of the month of the post and
# the (categorical) year in which the account was created.
data["created_at"] = pd.to_datetime(data["created_at"])
data["day_post_created"] = data["created_at"].dt.day
data["year_account_created"] = pd.Categorical(
    pd.DatetimeIndex(data["account_created_at"]).year
)

In [None]:
data_processed = data.copy()
data_processed.to_csv('datapriv_processed.csv', index = True)

# Analysis

In [None]:
df2 = pd.read_csv('datapriv_processed.csv')

In [None]:
df2 = df2[["country", "is_quote", "retweet_count", "day_post_created", "followers_count", "friends_count", 
              "listed_count", "verified", "year_account_created"]]
df2.head()

In [None]:
# Counts become floats so they are treated as numeric features; `verified`
# becomes object so it is handled as a categorical column.
df2 = df2.astype({
    'followers_count': "float64",
    'friends_count': "float64",
    'retweet_count': "float64",
    'day_post_created': "float64",
    'verified': "object",
})

In [None]:
df2['country'].value_counts()
# top 5: unknown, united states, united kingdom, visayas (philippines), india

In [None]:
# Drop every row that has at least one missing value.
df2 = df2.dropna()

In [None]:
df2.isna().sum()

In [None]:
# Treat these two columns as categorical (object) rather than numeric/boolean.
df2 = df2.astype({'is_quote': "object", 'year_account_created': "object"})

In [None]:
#df2 = df2.drop('year_account_created', axis = 1)

In [None]:
df2.dtypes

In [None]:
# Numeric columns go to one frame, every remaining column to the other.
numerical_data = df2.select_dtypes(include=np.number)
categorical_data = df2.drop(columns=numerical_data.columns)

### Processing Data

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the numeric features, then wrap the resulting array back into
# a data frame that keeps the original column names.
numerical_data_normalized = pd.DataFrame(
    MinMaxScaler().fit_transform(numerical_data),
    columns=numerical_data.columns,
)

numerical_data_normalized.head()

In [None]:
# One-hot encode the categorical columns (first level of each dropped).
# reset_index(drop=True) renumbers the rows from 0 without turning the old
# row labels into an extra 'index' column.
categorical_data_codified = pd.get_dummies(
                                    categorical_data,
                                    drop_first=True,
                                    dtype="int64"
).reset_index(drop=True)
categorical_data_codified.head()

In [None]:
# Keep the five most frequent countries, the two boolean flags and one dummy
# column per account-creation year from 2007 to 2021.
categorical_data_codified = categorical_data_codified[
    ['country_Unknown', 'country_United States', 'country_United Kingdom',
     'country_Visayas', 'country_India', 'is_quote_True', 'verified_True']
    + [f'year_account_created_{year}' for year in range(2007, 2022)]
]

In [None]:
df2_processed = pd.concat([
                            numerical_data_normalized,
                            categorical_data_codified
                        ], axis=1
).reset_index()

In [None]:
# errors='ignore' makes the cell safe to re-run: once the 'index' column is
# gone, a second run would otherwise raise KeyError.
df2_processed = df2_processed.drop(columns='index', errors='ignore')


In [None]:
df2_processed.head()


### Finding K

In [None]:
%%time 
# Candidate cluster counts, and the inertia of a k-means fit for each of them.
range_k = [2, 3, 4, 5, 8, 10, 15, 20, 25, 30]
results_k = {k: evaluate_k_kmeans_inertia(k, df2_processed) for k in range_k}

In [None]:
fig, ax = plt.subplots(figsize=(8,6))
# x and y are passed by keyword (recent seaborn releases no longer accept them
# positionally), and the line is drawn on the axes created above.
sns.lineplot(
    x=list(results_k.keys()),
    y=list(results_k.values()),
    label="inertia", color="red", ax=ax)
ax.set_xlabel("K")
ax.set_ylabel("inertia")
ax.set_title("Inertia by K")
ax.legend();

### Cluster Analysis

In [None]:
estimator_kmeans = KMeans(random_state=42, n_clusters=10) # previously 8 with account created
estimator_kmeans.fit(df2_processed)

In [None]:
df2["cluster_id"] = estimator_kmeans.labels_
df2.head()

In [None]:
df2.cluster_id.value_counts()

In [None]:
cluster_summary(1, df2)

In [None]:
cluster_comparison(0,1,2,3,4,5,6,7,8,9)

In [None]:
cluster_results = kmeans_cluster(df2_processed, 10)
cluster_summary2 = graph_summary_clustering(cluster_results)

In [None]:
#cluster_summary2 = cluster_summary2.drop('cluster_id', axis = 1)
cluster_summary2

In [None]:
# Heatmap of the per-cluster feature means: features as rows, clusters as columns.
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(cluster_summary2.T, annot=False, ax=ax)
# the cluster looks better than the previous one on only the tweets