# SmogTower

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, f1_score, confusion_matrix, log_loss
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, PredefinedSplit

In [None]:
# Labeled tweets for this section, read from the mounted Drive folder.
data_whole = pd.read_csv(filepath_or_buffer='/content/drive/My Drive/CSCW_H1/data/ST_labeled.csv')

# Params
# n_folds: number of cross-validation folds; seed: random_state reused by every
# splitter below; ModelName: classifier class that the evaluation cell instantiates.
n_folds, seed = 5, 0
ModelName = MultinomialNB

In [None]:
def anyalnum(string):
  """Return True when at least one character of `string` is alphanumeric.

  An empty string yields False.
  """
  for char in string:
    if char.isalnum():
      return True
  return False

def preprocess(tweet):
  """Normalise a raw tweet string before vectorisation.

  Steps, in order:
    1. Re-attach mention/hashtag markers that are followed by a space
       ('@ user' -> '@user', '# tag' -> '#tag').
    2. Drop everything from a literal 'pic.twitter.com' to the end of the line.
    3. Drop http/https URLs (up to the next whitespace).
    4. Drop whatever follows a remaining bare 'http://' / 'https://' up to the
       end of the line (covers links that were split by a space).

  Args:
    tweet: the tweet text (str).

  Returns:
    The cleaned text; surrounding whitespace is left as is.
  """
  tweet = tweet.replace('@ ','@').replace('# ','#')
  # Raw strings avoid the invalid '\s' escape; the dots are escaped so that
  # ordinary words such as "epic twitter comeback" are no longer truncated.
  tweet = re.sub(r'pic\.twitter\.com.*','',tweet)
  tweet = re.sub(r'https?://\S+','',tweet)
  tweet = re.sub(r'https?://.*','',tweet)
  # Disabled step: drop whitespace-separated tokens with no alphanumeric character.
  #tweet = ' '.join(word for word in tweet.split() if anyalnum(word))
  return tweet

# Clean every tweet, then drop rows whose cleaned text repeats an earlier row
# and renumber the remaining rows from 0.
data_whole['processed_tweet'] = data_whole['tweet'].apply(preprocess)
data = data_whole.drop_duplicates('processed_tweet').reset_index(drop=True)
# Seeded so the preview shows the same row on every Restart & Run All.
data.sample(1, random_state=seed)

Unnamed: 0,tweet,Label,cashtags,conversation_id,created_at,date,day,favorites,geo,has_media,hashtags,hour,id,img_urls,is_replied,is_reply_to,likes,link,links,mentions,name,near,nlikes,nreplies,nretweets,parent_tweet_id,place,quote_url,replies,reply_to,reply_to_users,retweet,retweet_date,retweet_id,retweets,screen_name,search,source,text_html,timestamp_epochs,timezone,trans_dest,trans_src,translate,tweet_id,user_id,user_id_str,user_rt,user_rt_id,username,video_url,lang,processed_tweet
43,India's own 'smog tower' may help combat air p...,2,,,,2018-11-05 12:37,,1.0,,,,,1.06e+18,,,,,https://twitter.com/PaperDabba/status/10593415...,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,PaperDabba,,en,India's own 'smog tower' may help combat air p...


In [None]:
# Row counts at each de-duplication stage, followed by the class balance.
for caption, frame in (
    ('number of tweets', data_whole),
    ('number of deduplicated tweets', data_whole.drop_duplicates('tweet')),
    ('number of deduplicated tweets after processing', data),
):
  print(caption, len(frame))
data['Label'].value_counts()

number of tweets 516
number of deduplicated tweets 446
number of deduplicated tweets after processing 430


2    285
1     82
0     63
Name: Label, dtype: int64

In [None]:
# Bag-of-words features; the vectorizer is re-fitted on each fold's training tweets.
vectorizer = CountVectorizer(stop_words='english', lowercase=True)
splitter = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
tweets, labels = data['processed_tweet'], data['Label']
# NOTE(review): neither split is stratified although the classes are imbalanced — confirm this is intended.
data_dict = {}
for fold, (train_val_ind, test_ind) in enumerate(splitter.split(data.index)):
  # Carve 20% of the non-test rows out as a validation set.
  train_ind, val_ind = train_test_split(train_val_ind, random_state=seed, test_size=0.20)
  # Dict entries are evaluated top to bottom, so fit_transform runs first and
  # the test/validation matrices reuse the vocabulary learned from training rows.
  data_dict[fold] = {
      'train_X': vectorizer.fit_transform(tweets.loc[train_ind]),
      'test_X': vectorizer.transform(tweets.loc[test_ind]),
      'val_X': vectorizer.transform(tweets.loc[val_ind]),
      'train_y': labels.loc[train_ind],
      'test_y': labels.loc[test_ind],
      'val_y': labels.loc[val_ind],
  }

In [None]:
# Per fold: pick hyperparameters on the validation split, refit on the training
# split, predict the held-out test split; report metrics over all folds pooled.
test_y_all = []
test_pred_y_all = []
parameters = {'fit_prior':(True, False), 'alpha':[0.01,0.05,0.1,0.5,1]}
for fold in range(n_folds):
  print('fold',fold)
  fold_data = data_dict[fold]
  ######## Hyperparameter search#########################
  # One predefined split: validation rows (stacked first) form test fold 0,
  # training rows are tagged -1 so they are only ever used for fitting.
  n_val = fold_data['val_X'].shape[0]
  n_train = fold_data['train_X'].shape[0]
  ps = PredefinedSplit([0]*n_val + [-1]*n_train)
  search_X = np.vstack([fold_data['val_X'].toarray(), fold_data['train_X'].toarray()])
  search_y = fold_data['val_y'].to_list() + fold_data['train_y'].to_list()
  # A fresh estimator per fold: the search never receives a model fitted on an earlier fold.
  clf = GridSearchCV(ModelName(), parameters, cv=ps, scoring='neg_log_loss', refit=False)
  clf.fit(search_X, search_y)
  print(clf.best_params_)
  ######### Fit-predict with best params#################
  model = ModelName(**clf.best_params_)
  model.fit(fold_data['train_X'], fold_data['train_y'])
  test_y_all.extend(fold_data['test_y'].values.tolist())
  test_pred_y_all.extend(model.predict(fold_data['test_X']))
print(classification_report(test_y_all, test_pred_y_all))

fold 0
{'alpha': 1, 'fit_prior': False}
fold 1
{'alpha': 1, 'fit_prior': False}
fold 2
{'alpha': 0.5, 'fit_prior': False}
fold 3
{'alpha': 1, 'fit_prior': False}
fold 4
{'alpha': 1, 'fit_prior': False}
              precision    recall  f1-score   support

           0       0.77      0.57      0.65        63
           1       0.51      0.46      0.49        82
           2       0.83      0.90      0.87       285

    accuracy                           0.77       430
   macro avg       0.70      0.65      0.67       430
weighted avg       0.76      0.77      0.76       430



# OddEven

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, f1_score, confusion_matrix, log_loss
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, PredefinedSplit

In [None]:
# Read the two labeled CSV files (OE_labeled and OE_labeled_extra).
data_whole = pd.read_csv(filepath_or_buffer='/content/drive/My Drive/CSCW_H1/data/OE_labeled.csv')
data_extra = pd.read_csv(filepath_or_buffer='/content/drive/My Drive/CSCW_H1/data/OE_labeled_extra.csv')
# Stack both files, discard rows without a label, renumber the rows from 0
# and store the labels as integers.
data_whole = (
    pd.concat([data_whole, data_extra])
      .dropna(subset=['Label'])
      .reset_index(drop=True)
      .astype({'Label': int})
)
print(len(data_whole))

# Params
# n_folds: number of cross-validation folds; seed: random_state reused by every
# splitter below; ModelName: classifier class that the evaluation cell instantiates.
n_folds, seed = 5, 0
ModelName = MultinomialNB

1098


In [None]:
def anyalnum(string):
  """Return True when at least one character of `string` is alphanumeric.

  An empty string yields False.
  """
  for char in string:
    if char.isalnum():
      return True
  return False

def preprocess(tweet):
  """Normalise a raw tweet string before vectorisation.

  Steps, in order:
    1. Re-attach mention/hashtag markers that are followed by a space
       ('@ user' -> '@user', '# tag' -> '#tag').
    2. Drop everything from a literal 'pic.twitter.com' to the end of the line.
    3. Drop http/https URLs (up to the next whitespace).
    4. Drop whatever follows a remaining bare 'http://' / 'https://' up to the
       end of the line (covers links that were split by a space).

  Args:
    tweet: the tweet text (str).

  Returns:
    The cleaned text; surrounding whitespace is left as is.
  """
  tweet = tweet.replace('@ ','@').replace('# ','#')
  # Raw strings avoid the invalid '\s' escape; the dots are escaped so that
  # ordinary words such as "epic twitter comeback" are no longer truncated.
  tweet = re.sub(r'pic\.twitter\.com.*','',tweet)
  tweet = re.sub(r'https?://\S+','',tweet)
  tweet = re.sub(r'https?://.*','',tweet)
  # Disabled step: drop whitespace-separated tokens with no alphanumeric character.
  #tweet = ' '.join(word for word in tweet.split() if anyalnum(word))
  return tweet

# Clean every tweet, then drop rows whose cleaned text repeats an earlier row
# and renumber the remaining rows from 0.
data_whole['processed_tweet'] = data_whole['tweet'].apply(preprocess)
data = data_whole.drop_duplicates('processed_tweet').reset_index(drop=True)
# Seeded so the preview shows the same row on every Restart & Run All.
data.sample(1, random_state=seed)

Unnamed: 0.2,tweet,Label,cashtags,conversation_id,created_at,date,day,favorites,geo,has_media,hashtags,hour,id,img_urls,is_replied,is_reply_to,likes,link,links,mentions,name,near,nlikes,nreplies,nretweets,parent_tweet_id,place,quote_url,replies,reply_to,reply_to_users,retweet,retweet_date,retweet_id,retweets,screen_name,search,source,text_html,timestamp_epochs,timezone,trans_dest,trans_src,translate,tweet_id,user_id,user_id_str,user_rt,user_rt_id,username,video_url,lang,ZLabel,TLabel,RLabel,"(1, 'ZLabel')","(148, 'ZLabel')","(7, 'TLabel')","(349, 'TLabel')","(118, 'RLabel')","(665, 'RLabel')",Unnamed: 0,Unnamed: 0.1,corrected_id,pred_new,pred_old,processed_tweet
800,#Delhi govt to review #OddEven first phase tod...,1,,,,,,,,,,,,,,,,https://twitter.com/SAsiaNewsline/status/68896...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.0,6.0,6.889692e+17,0.0,0.0,#Delhi govt to review #OddEven first phase tod...


In [None]:
# Row counts at each de-duplication stage, followed by the class balance.
for caption, frame in (
    ('number of tweets', data_whole),
    ('number of deduplicated tweets', data_whole.drop_duplicates('tweet')),
    ('number of deduplicated tweets after processing', data),
):
  print(caption, len(frame))
data['Label'].value_counts()

number of tweets 1098
number of deduplicated tweets 1096
number of deduplicated tweets after processing 1093


1    668
2    238
0    187
Name: Label, dtype: int64

In [None]:
# Bag-of-words features; the vectorizer is re-fitted on each fold's training tweets.
vectorizer = CountVectorizer(stop_words='english', lowercase=True)
splitter = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
tweets, labels = data['processed_tweet'], data['Label']
# NOTE(review): neither split is stratified although the classes are imbalanced — confirm this is intended.
data_dict = {}
for fold, (train_val_ind, test_ind) in enumerate(splitter.split(data.index)):
  # Carve 20% of the non-test rows out as a validation set.
  train_ind, val_ind = train_test_split(train_val_ind, random_state=seed, test_size=0.20)
  # Dict entries are evaluated top to bottom, so fit_transform runs first and
  # the test/validation matrices reuse the vocabulary learned from training rows.
  data_dict[fold] = {
      'train_X': vectorizer.fit_transform(tweets.loc[train_ind]),
      'test_X': vectorizer.transform(tweets.loc[test_ind]),
      'val_X': vectorizer.transform(tweets.loc[val_ind]),
      'train_y': labels.loc[train_ind],
      'test_y': labels.loc[test_ind],
      'val_y': labels.loc[val_ind],
  }

In [None]:
# Per fold: pick hyperparameters on the validation split, refit on the training
# split, predict the held-out test split; report metrics over all folds pooled.
test_y_all = []
test_pred_y_all = []
# Alpha grid listed in ascending order (same values as before).
parameters = {'fit_prior':(True, False), 'alpha':[0.01,0.05,0.1,0.5,1]}
for fold in range(n_folds):
  print('fold',fold)
  fold_data = data_dict[fold]
  # Hyperparameter search on one predefined split: validation rows (stacked
  # first) form test fold 0, training rows are tagged -1 so they are only
  # ever used for fitting.
  n_val = fold_data['val_X'].shape[0]
  n_train = fold_data['train_X'].shape[0]
  ps = PredefinedSplit([0]*n_val + [-1]*n_train)
  search_X = np.vstack([fold_data['val_X'].toarray(), fold_data['train_X'].toarray()])
  search_y = fold_data['val_y'].to_list() + fold_data['train_y'].to_list()
  # A fresh estimator per fold: the search never receives a model fitted on an earlier fold.
  clf = GridSearchCV(ModelName(), parameters, cv=ps, scoring='neg_log_loss', refit=False)
  clf.fit(search_X, search_y)

  print(clf.best_params_)

  # Fit on the training split with the selected parameters and predict the test split.
  model = ModelName(**clf.best_params_)
  model.fit(fold_data['train_X'], fold_data['train_y'])
  test_y_all.extend(fold_data['test_y'].values.tolist())
  test_pred_y_all.extend(model.predict(fold_data['test_X']))
print(classification_report(test_y_all, test_pred_y_all))

fold 0
{'alpha': 1, 'fit_prior': False}
fold 1
{'alpha': 1, 'fit_prior': False}
fold 2
{'alpha': 1, 'fit_prior': False}
fold 3
{'alpha': 1, 'fit_prior': False}
fold 4
{'alpha': 1, 'fit_prior': False}
              precision    recall  f1-score   support

           0       0.44      0.28      0.35       187
           1       0.68      0.80      0.74       668
           2       0.47      0.37      0.42       238

    accuracy                           0.62      1093
   macro avg       0.53      0.49      0.50      1093
weighted avg       0.60      0.62      0.60      1093

