According to a study by Pear Analytics [16], about 40% of all the tweets are pointless “babbles” like “have to get something from the minimart downstairs”

In [1]:
import pickle
from os import path
import re
from nltk.tokenize import TweetTokenizer
import nltk
import csv
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import CMUTweetTagger
from sklearn.metrics.pairwise import pairwise_distances
import scipy.cluster.hierarchy as sch
import fastcluster
from collections import Counter
import codecs
from datetime import datetime
from glob import glob
import pandas as pd
import sys
import langid

In [2]:
DIR_DATA = path.join('data', 'twitter data1')
DIR_GEO = path.join('data', 'geofiles')

In [3]:
# Loading the saved file is as easy as running these lines of code
#with open(path.join(DIR_DATA, 'clean_data.pkl'), 'rb') as in_file:
#    df = pickle.load(in_file)

# Read the file

In [4]:
# Read the splitted tsv files
all_files = glob(path.join(DIR_DATA, '*.tsv'))
if path.join(DIR_DATA, 'twex.tsv') in all_files:
    all_files.remove(path.join(DIR_DATA, 'twex.tsv'))

df_from_each_file = (pd.read_csv(
    file_name,
    sep="\t",
    encoding='utf-8',
    escapechar='\\',
    na_values='N',
    quoting=csv.QUOTE_NONE,
    header=None
    )
    for file_name in all_files)
print('Reading twex.tsv file...')
df = pd.concat(df_from_each_file, ignore_index=True)
print('is done!')

Reading twex.tsv file...
is done!


In [5]:
# Read the schema file
print('Reading schema.txt file...')
schema = pd.read_csv(
    path.join(DIR_DATA, 'schema.txt'),
    sep="\s+",
    header=None
)
print('is done!')

# Rename the dataframe columns
df.columns = schema[1]

Reading schema.txt file...
is done!


In [6]:
# Our observations suggest that considering latitude/logitude columns is more accurate
df['latitude'].fillna(df['placeLatitude'], inplace=True)
df['longitude'].fillna(df['placeLongitude'], inplace=True)

# Just keep the important columns
df = df[['id', 'userId', 'createdAt', 'longitude', 'latitude', 'text']]

# Change the string in 'createdAt' column to datetime format
df['createdAt'] = pd.to_datetime(
    df['createdAt'],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce'
)

# Change the possible strings to numbers
df['id'] = df['id'].apply(lambda x: pd.to_numeric(x, errors='coerce'))
df['userId'] = df['userId'].apply(lambda x: pd.to_numeric(x, errors='coerce'))
df['longitude'] = df['longitude'].apply(lambda x: pd.to_numeric(x, errors='coerce'))
df['latitude'] = df['latitude'].apply(lambda x: pd.to_numeric(x, errors='coerce'))

In [7]:
# Drop rows with NaN values in important columns
df = df.dropna(subset=['id', 'userId', 'createdAt', 'longitude', 'latitude'], how='any')

# Change the id and user id format to integer 
df['id'] = df['id'].astype(np.int64)
df['userId'] = df['userId'].astype(np.int64)

# Remove duplicated tweets with the same id (it is too time consuming!)
df = df.drop_duplicates(subset='id')

# Reset index
df = df.reset_index(drop=True)

In [8]:
# Add some columns for further analysis
df['day'] = df['createdAt'].map(lambda x: x.day)
df['month'] = df['createdAt'].map(lambda x: x.month)
df['year'] = df['createdAt'].map(lambda x: x.year)
daily_user = ['userId', 'year', 'month', 'day']
df['daily_tweets'] = df.groupby(by=daily_user)['userId'].transform('count')

In [9]:
df.sort_values(by='createdAt', ascending=1, inplace = True)

# Preprocessing

In [10]:
# Here we normalize the text, the code is taken from 
#https://github.com/heerme/twitter-topics/blob/master/twitter-topics-from-json-text-stream.py
def normalize_text(text):
    if type(text) is not str:
        print(text)
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(pic\.twitter\.com/[^\s]+))','', text)
    text = re.sub('@[^\s]+','', text)
    text = re.sub('#([^\s]+)', '', text)
    text = re.sub('[:;>?<=*+()/,\-#!$%\{˜|\}\[^_\\@\]1234567890’‘]',' ', text)
    text = re.sub('[\d]','', text)
    text = text.replace(".", '')
    text = text.replace("'", ' ')
    text = text.replace("\"", ' ')
    #text = text.replace("-", " ")
    #normalize some utf8 encoding
    text = text.replace("\x9d",' ').replace("\x8c",' ')
    text = text.replace("\xa0",' ')
    text = text.replace("\x9d\x92", ' ').replace("\x9a\xaa\xf0\x9f\x94\xb5", ' ').replace("\xf0\x9f\x91\x8d\x87\xba\xf0\x9f\x87\xb8", ' ').replace("\x9f",' ').replace("\x91\x8d",' ')
    text = text.replace("\xf0\x9f\x87\xba\xf0\x9f\x87\xb8",' ').replace("\xf0",' ').replace('\xf0x9f','').replace("\x9f\x91\x8d",' ').replace("\x87\xba\x87\xb8",' ')	
    text = text.replace("\xe2\x80\x94",' ').replace("\x9d\xa4",' ').replace("\x96\x91",' ').replace("\xe1\x91\xac\xc9\x8c\xce\x90\xc8\xbb\xef\xbb\x89\xd4\xbc\xef\xbb\x89\xc5\xa0\xc5\xa0\xc2\xb8",' ')
    text = text.replace("\xe2\x80\x99s", " ").replace("\xe2\x80\x98", ' ').replace("\xe2\x80\x99", ' ').replace("\xe2\x80\x9c", " ").replace("\xe2\x80\x9d", " ")
    text = text.replace("\xe2\x82\xac", " ").replace("\xc2\xa3", " ").replace("\xc2\xa0", " ").replace("\xc2\xab", " ").replace("\xf0\x9f\x94\xb4", " ").replace("\xf0\x9f\x87\xba\xf0\x9f\x87\xb8\xf0\x9f", "")
    return text

In [11]:
df['text'] = df['text'].apply(str)

In [12]:
# Find the hashtags and users
df['Hashtags'] = df['text'].apply(lambda x:{tag.strip("#") for tag in x.split() if tag.startswith("#")})
df['users'] = df['text'].apply(lambda x:{tag.strip("@") for tag in x.split() if tag.startswith("@")})

In [13]:
df.dropna(subset = ['text'],inplace=True)

In [14]:
df['processed_text'] = df['text'].apply(lambda x: normalize_text(x))
df.reset_index(inplace = True, drop = True)

In [15]:
#  filter the blank cells
filter_text = (df["processed_text"] != "") & (df["processed_text"] != " ") & (df["processed_text"] != "  ") \
    & (df["processed_text"] != "   ") 
df = df[filter_text]
df.reset_index(inplace=True,drop = True)

In [16]:
filtering = df['year'] == 2016
df2016 = df[filtering]

In [17]:
df2016.reset_index(drop = True,inplace = True)

In [18]:
# As there are so many tweets about the weather, we postprocess the data by removing the tweets with removing them
Weather_words = ['falling', 'CForecast','humidity','pressure','wind','temperature ', 'Conditions', 'Cloudy', 'hpa', 'humidite', 'info',\
                 'km','Rain','fog','foggy','cloudy','kmh', 'mm', 'pluie', 'pression', 'temp','Temp', 'vent']
filtering = df2016['processed_text'].apply(lambda x:not any(w in Weather_words for w in x.split()))
df2016_noW = df2016[filtering]
df2016_noW.reset_index(drop=True,inplace=True)

In [19]:
stop_words = nltk.corpus.stopwords.words('english')
stop_words.extend(nltk.corpus.stopwords.words('french'))
stop_words.extend(nltk.corpus.stopwords.words('italian'))
stop_words.extend(nltk.corpus.stopwords.words('german'))

According to our search in the literutre we found that stemming is not that helpful, in the case one want ot do stemming, first he should detect the language (for example by langdetect package) and then use the stemmer developed for that language

In [20]:
def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [21]:
#not super pythonic, no, not at all.
#use extend so it's a big flat list of vocab
totalvocab_tokenized = []
for index,row in df2016_noW.iloc[:300000].iterrows():
    tex = row['processed_text']
    allwords_tokenized = tokenize_only(tex)
    totalvocab_tokenized.extend(allwords_tokenized)

In [None]:
# The code is taken from
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords



#----------------------------------------------------------------------
def _calculate_languages_ratios(text):
    """
    Calculate probability of given text to be written in several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}
    
    @param text: Text whose language want to be detected
    @type text: str
    
    @return: Dictionary with languages and unique stopwords seen in analyzed text
    @rtype: dict
    """

    languages_ratios = {}

    '''
    nltk.wordpunct_tokenize() splits all punctuations into separate tokens
    
    >>> wordpunct_tokenize("That's thirty minutes away. I'll be there in ten.")
    ['That', "'", 's', 'thirty', 'minutes', 'away', '.', 'I', "'", 'll', 'be', 'there', 'in', 'ten', '.']
    '''

    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    # Compute per language included in nltk number of unique stopwords appearing in analyzed text
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)

        languages_ratios[language] = len(common_elements) # language "score"

    return languages_ratios


#----------------------------------------------------------------------
def detect_language(text):
    """
    Calculate probability of given text to be written in several languages and
    return the highest scored.
    
    It uses a stopwords based approach, counting how many unique stopwords
    are seen in analyzed text.
    
    @param text: Text whose language want to be detected
    @type text: str
    
    @return: Most scored language guessed
    @rtype: str
    """

    ratios = _calculate_languages_ratios(text)

    most_rated_language = max(ratios, key=ratios.get)

    return most_rated_language



    language = detect_language(text)



In [None]:
# finding the language of the texts
df2016_noW['language']= df2016_noW['processed_text'].apply(lambda x: langid.classify(x))
df2016_noW['language'] = df2016_noW['language'].apply(lambda x:x[0])

In [None]:
#Frequency of the words
x = df2016_noW.groupby('language').count()
test = x.sort('id', ascending=False)

In [None]:
test[:5]

In [130]:
filtering = df2016_noW['language'] == 'en'
df2016_noW_eng = df2016_noW[filtering]

In [169]:
# As the 
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer( max_features=200000,
                                 min_df= 0.01, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_only, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(df2016_noW_eng.iloc[:300000]['processed_text']) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

CPU times: user 1min, sys: 723 ms, total: 1min
Wall time: 1min 1s
(300000, 32)


In [170]:
terms = tfidf_vectorizer.get_feature_names() # some possible features for being a stopword

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [171]:
terms

['amp',
 'best',
 'd',
 'day',
 'don',
 'don t',
 'geneva',
 'good',
 'great',
 'happy',
 'just',
 'just posted',
 'know',
 'like',
 'love',
 'm',
 'morning',
 'nan',
 'need',
 'new',
 'people',
 'photo',
 'posted',
 's',
 'switzerland',
 't',
 'thank',
 'thanks',
 'time',
 'today',
 'want',
 'world']