According to a study by Pear Analytics [16], about 40% of all the tweets are pointless “babbles” like “have to get something from the minimart downstairs”

In [1]:
import pickle
from os import path
import re
from nltk.tokenize import TweetTokenizer
import nltk
import csv
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import CMUTweetTagger
from sklearn.metrics.pairwise import pairwise_distances
import scipy.cluster.hierarchy as sch
import fastcluster
from collections import Counter
import codecs
from datetime import datetime
from glob import glob
import pandas as pd
import sys
import langid
from guess_language import guess_language

In [2]:
DIR_DATA = path.join('data', 'twitter data1')
DIR_GEO = path.join('data', 'geofiles')

In [3]:
# Loading the saved file is as easy as running these lines of code
#with open(path.join(DIR_DATA, 'clean_data.pkl'), 'rb') as in_file:
#    df = pickle.load(in_file)

# Read the file

In [None]:
# Read the splitted tsv files
all_files = glob(path.join(DIR_DATA, '*.tsv'))
if path.join(DIR_DATA, 'twex.tsv') in all_files:
    all_files.remove(path.join(DIR_DATA, 'twex.tsv'))

df_from_each_file = (pd.read_csv(
    file_name,
    sep="\t",
    encoding='utf-8',
    escapechar='\\',
    na_values='N',
    quoting=csv.QUOTE_NONE,
    header=None
    )
    for file_name in all_files)
print('Reading twex.tsv file...')
df = pd.concat(df_from_each_file, ignore_index=True)
print('is done!')

In [None]:
# Read the schema file
print('Reading schema.txt file...')
schema = pd.read_csv(
    path.join(DIR_DATA, 'schema.txt'),
    sep="\s+",
    header=None
)
print('is done!')

# Rename the dataframe columns
df.columns = schema[1]

In [None]:
df['text'] = df['text'].apply(str)

In [None]:
df_temp = df

In [None]:
df = df.iloc[:300000]
df.reset_index(drop=True,inplace = True)

In [None]:
# Our observations suggest that considering latitude/logitude columns is more accurate
df['latitude'].fillna(df['placeLatitude'], inplace=True)
df['longitude'].fillna(df['placeLongitude'], inplace=True)

# Just keep the important columns
df = df[['id', 'userId', 'createdAt', 'longitude', 'latitude', 'text']]

# Change the string in 'createdAt' column to datetime format
df['createdAt'] = pd.to_datetime(
    df['createdAt'],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce'
)

# Change the possible strings to numbers
df['id'] = df['id'].apply(lambda x: pd.to_numeric(x, errors='coerce'))
df['userId'] = df['userId'].apply(lambda x: pd.to_numeric(x, errors='coerce'))
df['longitude'] = df['longitude'].apply(lambda x: pd.to_numeric(x, errors='coerce'))
df['latitude'] = df['latitude'].apply(lambda x: pd.to_numeric(x, errors='coerce'))

In [None]:
df['language'] = np.nan
df.loc[:,'language'] =  df['text'].apply(lambda x: langid.classify(x)[0])

In [None]:
df_temp = df

In [None]:
df = df[df['language']=='en']
df.reset_index(drop=True,inplace = True)

In [None]:
# Drop rows with NaN values in important columns
df = df.dropna(subset=['id', 'userId', 'createdAt', 'longitude', 'latitude'], how='any')

# Change the id and user id format to integer 
df['id'] = df['id'].astype(np.int64)
df['userId'] = df['userId'].astype(np.int64)

# Remove duplicated tweets with the same id (it is too time consuming!)
df = df.drop_duplicates(subset='id')

# Reset index
df = df.reset_index(drop=True)

In [None]:
# Add some columns for further analysis
df['day'] = df['createdAt'].map(lambda x: x.day)
df['month'] = df['createdAt'].map(lambda x: x.month)
df['year'] = df['createdAt'].map(lambda x: x.year)
daily_user = ['userId', 'year', 'month', 'day']
df['daily_tweets'] = df.groupby(by=daily_user)['userId'].transform('count')

# Preprocessing

In [None]:
# Here we normalize the text, the code is taken from 
#https://github.com/heerme/twitter-topics/blob/master/twitter-topics-from-json-text-stream.py
def normalize_text(text):
    if type(text) is not str:
        print(text)
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(pic\.twitter\.com/[^\s]+))','', text)
    text = re.sub('@[^\s]+','', text)
    text = re.sub('#([^\s]+)', '', text)
    text = re.sub('[:;>?<=*+()/,\-#!$%\{˜|\}\[^_\\@\]1234567890’‘]',' ', text)
    text = re.sub('[\d]','', text)
    text = text.replace(".", '')
    text = text.replace("'", ' ')
    text = text.replace("\"", ' ')
    #text = text.replace("-", " ")
    #normalize some utf8 encoding
    text = text.replace("\x9d",' ').replace("\x8c",' ')
    text = text.replace("\xa0",' ')
    text = text.replace("\x9d\x92", ' ').replace("\x9a\xaa\xf0\x9f\x94\xb5", ' ').replace("\xf0\x9f\x91\x8d\x87\xba\xf0\x9f\x87\xb8", ' ').replace("\x9f",' ').replace("\x91\x8d",' ')
    text = text.replace("\xf0\x9f\x87\xba\xf0\x9f\x87\xb8",' ').replace("\xf0",' ').replace('\xf0x9f','').replace("\x9f\x91\x8d",' ').replace("\x87\xba\x87\xb8",' ')	
    text = text.replace("\xe2\x80\x94",' ').replace("\x9d\xa4",' ').replace("\x96\x91",' ').replace("\xe1\x91\xac\xc9\x8c\xce\x90\xc8\xbb\xef\xbb\x89\xd4\xbc\xef\xbb\x89\xc5\xa0\xc5\xa0\xc2\xb8",' ')
    text = text.replace("\xe2\x80\x99s", " ").replace("\xe2\x80\x98", ' ').replace("\xe2\x80\x99", ' ').replace("\xe2\x80\x9c", " ").replace("\xe2\x80\x9d", " ")
    text = text.replace("\xe2\x82\xac", " ").replace("\xc2\xa3", " ").replace("\xc2\xa0", " ").replace("\xc2\xab", " ").replace("\xf0\x9f\x94\xb4", " ").replace("\xf0\x9f\x87\xba\xf0\x9f\x87\xb8\xf0\x9f", "")
    return text

In [None]:
# Find the hashtags and users
df['Hashtags'] = df['text'].apply(lambda x:{tag.strip("#") for tag in x.split() if tag.startswith("#")})
df['users'] = df['text'].apply(lambda x:{tag.strip("@") for tag in x.split() if tag.startswith("@")})

In [None]:
df.dropna(subset = ['text'],inplace=True)

In [None]:
df['processed_text'] = df['text'].apply(lambda x: normalize_text(x))
df.reset_index(inplace = True, drop = True)

In [None]:
#  filter the blank cells
filter_text = (df["processed_text"] != "") & (df["processed_text"] != " ") & (df["processed_text"] != "  ") \
    & (df["processed_text"] != "   ") 
df = df[filter_text]
df.reset_index(inplace=True,drop = True)

In [5]:
df.to_pickle(path.join(DIR_DATA, 'clean_data_Event.tsv'))

NameError: name 'df' is not defined

In [None]:
#filtering = df['year'] == 2016
#df = df[filtering]

In [None]:
df.reset_index(drop = True,inplace = True)

In [6]:
# Loading the saved file is as easy as running these lines of code
with open(path.join(DIR_DATA, 'clean_data_Event.tsv'), 'rb') as in_file:
    df = pickle.load(in_file)

In [7]:
# As there are so many tweets about the weather, we postprocess the data by removing the tweets with removing them
Weather_words = ['falling', 'CForecast','humidity','pressure','wind','temperature ', 'Conditions', 'Cloudy', 'hpa', 'humidite', 'info',\
                 'km','Rain','fog','foggy','cloudy','kmh', 'mm', 'pluie', 'pression', 'temp','Temp', 'vent']
filtering = df['processed_text'].apply(lambda x:not any(w in Weather_words for w in x.split()))
df = df[filtering]
df.reset_index(drop=True,inplace=True)

In [8]:
stop_words = nltk.corpus.stopwords.words('english')
stop_words.extend(nltk.corpus.stopwords.words('french'))
stop_words.extend(nltk.corpus.stopwords.words('italian'))
stop_words.extend(nltk.corpus.stopwords.words('german'))

In [9]:
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

According to our search in the literutre we found that stemming is not that helpful, in the case one want ot do stemming, first he should detect the language (for example by langdetect package) and then use the stemmer developed for that language

In [10]:
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [48]:
#not super pythonic, no, not at all.
#use extend so it's a big flat list of vocab
totalvocab_stemmed = []
totalvocab_tokenized = []
for index,row in df.iterrows():
    tex = row['processed_text']
    filtered_words = [str(word.lower()) for word in tex.split() if word.lower() not in stop_words]
    filtered_words = ' '.join(filtered_words)
    allwords_stemmed = tokenize_and_stem(filtered_words) #for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(filtered_words)
    totalvocab_tokenized.extend(allwords_tokenized)

In [49]:
len(totalvocab_tokenized)

361753

In [50]:
len(totalvocab_stemmed)

361753

In [51]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

there are 361753 items in vocab_frame


In [None]:
# finding the language of the texts
def find_lang(text):
    try:
        result = guess_language(text)
    except:
        result = np.nan
    return result
#df2016_noW['language'] = df2016_noW['processed_text'].apply(find_lang)  


In [None]:
frequency = df.groupby('language').count()
frequency.sort('id', ascending=False,inplace = True)

In [52]:
# As the 
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer( max_features=200000,
                                 min_df= 0.01, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_only, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(df.iloc[:10000]['processed_text']) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

CPU times: user 1.84 s, sys: 15.5 ms, total: 1.85 s
Wall time: 1.85 s
(10000, 50)


In [53]:
terms = tfidf_vectorizer.get_feature_names() # some possible features for being a stopword

In [55]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [56]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 605 ms, sys: 2.21 ms, total: 607 ms
Wall time: 608 ms


In [57]:
from sklearn.externals import joblib

#uncomment the below to save your model 
#since I've already run my model I am loading from the pickle

joblib.dump(km,  'doc_cluster.pkl')

#km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [64]:
films = { 'title': df['processed_text'], 'cluster': clusters }

frame = pd.DataFrame(films, index = [clusters] , columns = ['title','processed_text', 'cluster'])

In [65]:
frame['cluster'].value_counts() #number of films per cluster (clusters from 0 to 4)

0    8397
2     503
1     470
4     397
3     233
Name: cluster, dtype: int64

In [108]:

from __future__ import print_function

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0], end=',')
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d titles:" % i, end='')
    print(' %s,' % frame.loc[i]['title'].iloc[0], end='')
    print() #add whitespace
    print() #add whitespace
    
print()
print()

Top terms per cluster:

Cluster 0 words: juste, t, switzerland, good, like, amp,

Cluster 0 titles: The Rockefeller Foundation s Bellagio Centre A place to inspire creative thinking  ,

Cluster 1 words: nan, worked, fandom, juste, help, help,

Cluster 1 titles:  cannot find you on linkedin  but here we are,

Cluster 2 words: s, juste, today, t, good, like,

Cluster 2 titles: Great insight into the complex drivng factors of extremism  Deeyah Khan s film  Jihad a story of others   ,

Cluster 3 words: sharing, sign, sign, sharing, sign, nan,

Cluster 3 titles: Ratatatatatatatat,

Cluster 4 words: m, s, geneva, amp, juste, t,

Cluster 4 titles: In   hrs yesterday one of our local outdoor pools was transformed into a refuge camp for     people – expected soon ,





In [91]:
frame.iloc[0]['title']


'Great insight into the complex drivng factors of extremism  Deeyah Khan s film  Jihad a story of others   '

In [93]:
frame.iloc[1]['title']


'The Rockefeller Foundation s Bellagio Centre A place to inspire creative thinking  '

In [105]:
frame.loc[4]['title'].iloc[0]


'In   hrs yesterday one of our local outdoor pools was transformed into a refuge camp for     people – expected soon '

In [97]:
type(frame.loc[0]['title'])

pandas.core.series.Series