# Language Filtering

User [Langid](https://github.com/saffsd/langid.py) to classify tweets that got past our SQL query.


Postgres -> Pandas:
> dataframe = psql.read_sql("SELECT * FROM ", connection)

In [68]:
import os
from dotenv import find_dotenv, load_dotenv
import psycopg2 as pg
import pandas.io.sql as psql
import pandas as pd
import time

In [2]:
# Load variables from .env, connect to DB
load_dotenv(find_dotenv())
database_url = os.environ.get('DATABASE_URL')

conn = pg.connect(database_url)
curr = conn.cursor()

Count how many tweets we have total.

### Language

> SELECT COUNT(CustomerID), Country
>
> FROM Customers
>
> GROUP BY Country
> 
> ORDER BY COUNT(CustomerID) DESC

In [3]:
# count languages
# dataframe = psql.read_sql("SELECT * FROM ", connection)

lang_sql = """SELECT language, COUNT(language) FROM raw_tweets GROUP BY language ORDER BY COUNT(language) DESC;"""
lang_df = psql.read_sql(lang_sql, conn)

print ('Unique langs: {}'.format(lang_df['language'].unique()))
lang_df.head(30)

Unique langs: ['en' 'ar' 'fr' 'es' 'id' 'en-gb' 'de' 'tr' 'nl' 'ru' 'it' 'pt' 'en-GB'
 'sv' 'ja' 'ca' 'pl' 'he' 'da' 'th' 'fi' 'ko' 'hi' 'el' 'no' 'fa' 'zh-cn'
 'cs' 'hu' 'msa' 'en-AU' 'ro' 'zh-CN' 'hr' 'gl' 'xx-lc' 'vi' 'zh-tw' 'eu'
 'nb' 'sr' 'uk' 'ur' 'zh-TW' 'en-IN' 'lv' 'bg' 'es-MX' 'sk' 'in' 'fil'
 'ga' 'ms' 'ta' 'bn' 'kn' 'mr' 'zh-Hans' 'cy' 'en-US' 'pt-PT' 'gu' 'fr-CA'
 'sq' 'pa' 'sl' 'zh-Hant' 'af' 'sw' 'lt' 'zh-HK' 'EN' 'en-SS' 'TR' 'lolc'
 'be' 'az' 'ckb' 'et' 'so' 'en-CA' 'zh' 'ae' 'ku' 'ld' 'pt-BR' 'is' 'zz'
 'de-AT' 'mk' 'ml' 'mn' 'uz' 'ka' 'co' 'ht' 'to' 'am' 'en-ss' 'haw' 'jv'
 'nl-BE' 'gsw' 'bs' 'my' 'ig' 'us' 'az-Cyrl' 'de-CH' 'nap' 'pa-Arab' None]


Unnamed: 0,language,count
0,en,233624341
1,ar,176246762
2,fr,13745440
3,es,10557995
4,id,8175217
5,en-gb,4442070
6,de,3053145
7,tr,2504156
8,nl,2417177
9,ru,1110983


In [4]:
lang_df.language.sort_values().unique()

array(['EN', 'TR', 'ae', 'af', 'am', 'ar', 'az', 'az-Cyrl', 'be', 'bg',
       'bn', 'bs', 'ca', 'ckb', 'co', 'cs', 'cy', 'da', 'de', 'de-AT',
       'de-CH', 'el', 'en', 'en-AU', 'en-CA', 'en-GB', 'en-IN', 'en-SS',
       'en-US', 'en-gb', 'en-ss', 'es', 'es-MX', 'et', 'eu', 'fa', 'fi',
       'fil', 'fr', 'fr-CA', 'ga', 'gl', 'gsw', 'gu', 'haw', 'he', 'hi',
       'hr', 'ht', 'hu', 'id', 'ig', 'in', 'is', 'it', 'ja', 'jv', 'ka',
       'kn', 'ko', 'ku', 'ld', 'lolc', 'lt', 'lv', 'mk', 'ml', 'mn', 'mr',
       'ms', 'msa', 'my', 'nap', 'nb', 'nl', 'nl-BE', 'no', 'pa',
       'pa-Arab', 'pl', 'pt', 'pt-BR', 'pt-PT', 'ro', 'ru', 'sk', 'sl',
       'so', 'sq', 'sr', 'sv', 'sw', 'ta', 'th', 'to', 'tr', 'uk', 'ur',
       'us', 'uz', 'vi', 'xx-lc', 'zh', 'zh-CN', 'zh-HK', 'zh-Hans',
       'zh-Hant', 'zh-TW', 'zh-cn', 'zh-tw', 'zz', None], dtype=object)

In [5]:
lang_df.iloc[:]

Unnamed: 0,language,count
0,en,233624341
1,ar,176246762
2,fr,13745440
3,es,10557995
4,id,8175217
5,en-gb,4442070
6,de,3053145
7,tr,2504156
8,nl,2417177
9,ru,1110983


In [6]:
# count languages from filtered tweets
# dataframe = psql.read_sql("SELECT * FROM ", connection)

filter_lang_sql = """SELECT language, COUNT(language) FROM filter_tweets GROUP BY language ORDER BY COUNT(language) DESC;"""
filter_lang_df = psql.read_sql(filter_lang_sql, conn)

print ('Unique langs: {}'.format(filter_lang_df['language'].unique()))
filter_lang_df.head(10)

Unique langs: ['en' 'en-gb' 'en-GB' 'en-AU' 'en-IN' 'en-US' 'en-CA' 'en-SS']


Unnamed: 0,language,count
0,en,17006754
1,en-gb,420433
2,en-GB,43971
3,en-AU,2252
4,en-IN,74
5,en-US,24
6,en-CA,1
7,en-SS,1


---

#### Messages fit in memory?

In [9]:
!cat ../src/data/createdb.sql

CREATE TABLE IF NOT EXISTS "raw_tweets" (
    "id" SERIAL NOT NULL,
    "tweetID" BIGINT NOT NULL,
    "date" TIMESTAMP,
    "message" TEXT,
    "username" TEXT,
    "userID" BIGINT NOT NULL,
    "language" VARCHAR(10),
    "longitude" FLOAT,
    "latitude" FLOAT,
    "retweet" TEXT,
    CONSTRAINT raw_tweets_pk PRIMARY KEY ("id")
) WITH ( OIDS=FALSE );
CREATE TABLE IF NOT EXISTS "filter_tweets" (
    "id" SERIAL NOT NULL,
    "tweetID" BIGINT NOT NULL,
    "date" TIMESTAMP,
    "message" TEXT,
    "username" TEXT,
    "userID" BIGINT NOT NULL,
    "language" VARCHAR(10),
    "longitude" FLOAT,
    "latitude" FLOAT,
    "retweet" TEXT
) WITH ( OIDS=FALSE );
ALTER TABLE "filter_tweets" ADD CONSTRAINT "filter_fk0" FOREIGN KEY ("id") REFERENCES "raw_tweets"("id");


In [8]:
#dataframe = psql.read_sql("SELECT * FROM ", connection)
messages_query = """SELECT "tweetID", "message" FROM filter_tweets;  """
messages_df = psql.read_sql(messages_query, con=conn)

print ('mem usage:\t{}'.format(messages_df.memory_usage()))
messages_df.head(20)

mem usage:	Index             80
tweetID    139788080
message    139788080
dtype: int64


Unnamed: 0,tweetID,message
0,826251002722521089,It's getting really frustrating that my school...
1,826251003297140736,thisisBS this taxpayer DOESNOT WANT TO PAY FO...
2,826251004370890752,LIVE NOW: Senate hearing on Jack Lam and the B...
3,826251004832256000,Ridiculous!
4,826251005352357892,Acting Attorney General Orders Justice Dept. N...
5,826251005683642368,Hill staffers secretly worked on Trump's immig...
6,826251006191116288,refugee programs ? Send them to Guatamala;mexi...
7,826251007831187457,Interesting: how do I cancel all my services?
8,826251007889907714,If you mean it; stop advertising on the hate s...
9,826251007982182400,. you coward! What is your stance on the refu...


### Language Identifier

1. Filter emoticons, links, etc from messages. (*Don't lemmatize or stem, as we want as much natural text as possible for the LangID classifier*)
2. Create two columns as the langID classified language and probability.
3. Gather tweetID's of those who are not English tweets. 
4. Save to file???
5. Drop rows from filter `where tweetID IS IN VALUES([...])`

In [53]:
#%run ../src/utils/twokenize.py
%run ../src/utils/process_twitter_langid.py

import langid

In [27]:
messages_df['message'].iloc[0]

"It's getting really frustrating that my school has almost no other Muslims and I'm pretty sure I'm the only Muslim in my department"

In [28]:
langid.classify("It's getting really frustrating that my school has almost no other Muslims and I'm pretty sure I'm the only Muslim in my department")

('en', -337.46013283729553)

In [55]:
import numpy as np
from multiprocessing import cpu_count, Pool
 
cores = 8 #Number of CPU cores on your system
partitions = cores #Define as many partitions as you want
 
def parallelize(data, func):
    data_split = np.array_split(data, partitions)
    pool = Pool(cores)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data

In [45]:
test_df = messages_df.iloc[:1000]

# test_df['langid'], test_df['prob'] = parallelize(test_df['message'], langid.classify)

# test_df['langid'] = parallelize(test_df['message'], langid.classify)

#test_df['langid'], test_df['prob'] = test_df['message'].apply(lambda x: langid.classify(x))
# too many values to unpack (expected 2)

test_df['langid'] = test_df['message'].apply(lambda x: langid.classify(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [62]:
def langid_classifier(x):
    cleaned = clean_tweet(x)
    return langid.classify(cleaned)[0]

test_df['langid'] = test_df['message'].apply(lambda x: langid_classifier(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [63]:
test_df.head(20)

Unnamed: 0,tweetID,message,langid
0,826251002722521089,It's getting really frustrating that my school...,en
1,826251003297140736,thisisBS this taxpayer DOESNOT WANT TO PAY FO...,en
2,826251004370890752,LIVE NOW: Senate hearing on Jack Lam and the B...,en
3,826251004832256000,Ridiculous!,en
4,826251005352357892,Acting Attorney General Orders Justice Dept. N...,en
5,826251005683642368,Hill staffers secretly worked on Trump's immig...,en
6,826251006191116288,refugee programs ? Send them to Guatamala;mexi...,en
7,826251007831187457,Interesting: how do I cancel all my services?,en
8,826251007889907714,If you mean it; stop advertising on the hate s...,en
9,826251007982182400,. you coward! What is your stance on the refu...,en


In [66]:
test_df[test_df.langid != 'en'].tweetID.tolist()

[826251008212865026,
 826251008271667200,
 826251012532998144,
 826251014416248832,
 826251023622733827,
 826251028521693184,
 826251036155322368,
 826251038382501892,
 826251046460612608,
 826251058028625921,
 826251060788477952,
 826251067864276994,
 826251070573727744,
 826251073413382146,
 826251074742915073,
 826251080514338817,
 826251084997939201,
 826251085006327809,
 826251086730231808,
 826251087610933248,
 826251090819739648,
 826251091620671488,
 826251101859115008,
 826251103239041024,
 826251107970273281,
 826251109656363008,
 826251110671380484,
 826251112747585536,
 826251114622418944,
 826251118619553792,
 826251124495810562,
 826251125498245120,
 826251126370684929,
 826251128652173312,
 826251129038262272,
 826251130661396485,
 826251134239174656,
 826251135686152192,
 826251136759894016,
 826251137321799685,
 826251144057888769,
 826251144485863425,
 826251145840582656,
 826251155625877507,
 826251159182663680,
 826251168317833221,
 826251171010588672,
 826251173011

In [None]:
start = time.time()
messages_df['langid'] = messages_df['message'].apply(lambda x: langid_classifier(x))
end = time.time()

print ('Elapsed:\t{}'.format(end - start))