<a href="https://colab.research.google.com/github/sandhyaparna/NLP/blob/main/Dell_Take_Home.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import Libraries
import warnings
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np
from google.colab import files
import json
import seaborn as sns
import matplotlib.pyplot as plt
import string, re, os
import collections
from collections import defaultdict, Counter, OrderedDict

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')
more_stopwords = ['i']
stop_words = stop_words + more_stopwords

import spacy
nlp = spacy.load('en', parse = False, tag=False, entity=False)

from wordcloud import WordCloud
import textblob
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', -1)

In [3]:
print("upload boston_bombing_tweets - Home test Data Scientist.csv file")
tweets_uploaded = files.upload()

upload boston_bombing_tweets - Home test Data Scientist.csv file


Saving boston_bombing_tweets - Home test Data Scientist.csv to boston_bombing_tweets - Home test Data Scientist.csv


In [4]:
print("upload replacements_dict.json file")
dict_uploaded = files.upload()

upload replacements_dict.json file


Saving replacements_dict.json to replacements_dict.json


In [5]:
import io
boston_bombing_tweets = pd.read_csv(io.BytesIO(tweets_uploaded['boston_bombing_tweets - Home test Data Scientist.csv']))
print(f'Tweets data is loaded')

tweets_train = boston_bombing_tweets[pd.notnull(boston_bombing_tweets.label)]
print("Number of observations in train data are", tweets_train.shape[0])

tweets_test = boston_bombing_tweets[pd.isnull(boston_bombing_tweets.label)]
print("Number of observations in test data are", tweets_test.shape[0])

Tweets data is loaded
Number of observations in train data are 4000
Number of observations in test data are 12218


In [6]:
with open('replacements_dict.json') as json_file:
  replacements_dict = json_file.readlines()
  
replacements_dict = map(lambda x: x.rstrip(), replacements_dict)
replacements_dict = ' '.join(replacements_dict)
replacements_dict = json.loads(replacements_dict, object_pairs_hook=OrderedDict)
print("replacements dictionary is loaded")

replacements dictionary is loaded


In [7]:
# helper functions
def replace_all(text, dic=replacements_dict):
    for i, j in dic.items():
        text = re.sub(i, j, text)
    return text

def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text
    
def text_preprocess(text):
  text = re.sub('[%s]' % re.escape(string.punctuation.replace('#','')), '', text)
  text = re.sub('^# |# ', '', text)
  text = re.sub('[^\u0000-\u007e]+', '', text)
  text = ' '.join(word for word in text.split(' ') if word not in stop_words)
  return text

def create_corpus(data, text_var, condition):
    corpus=[]
    
    if condition == "hash":
      for x in data['text_processed'].apply(lambda x: re.findall(r'(?<=)#\w+',x)):
          for i in x:
            corpus.append(i)
    
    else:  # non-hash words
      for x in data['text_processed'].apply(lambda x: re.findall(r'(?#)\w+',x)):
        for i in x:
          corpus.append(i)

    return corpus

def get_top_tweet_ngrams(corpus, n):
    vec = CountVectorizer(ngram_range=(n, n), token_pattern=r'\b\w\w+\b|(?<!\w)#\w+').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:None]

In [8]:
# pre-processing function
def preprocessing_fn(data, text_var):
  """
  delete missing values, 

  """
  data = data[pd.notnull(data[text_var])]
  data[text_var+'_processed'] = data[text_var].str.lower()
  data[text_var+'_URL_present'] = np.where(data[text_var+'_processed'].str.contains("http://"),1,0)
  data[text_var+'_retweet'] = np.where(data[text_var+'_processed'].str.contains("rt @"),1, 0)

  data[text_var+'_processed'] = data[text_var+'_processed'].apply(lambda x: replace_all(x,replacements_dict))
  data[text_var+'_punctuations_count'] = data[text_var+'_processed'].apply(lambda x: re.findall('[%s]' % re.escape(string.punctuation),str(x))).str.len() 
  data[text_var+'_word_count'] = data[text_var+'_processed'].apply(lambda x: len(str(x).split()))
  data[text_var+'_hashtag_count'] = data[text_var+'_processed'].apply(lambda x: len([c for c in str(x) if c == '#']))
  
  data[text_var+'_processed'] = data[text_var+'_processed'].apply(lambda x: lemmatize_text(x))
  data[text_var+'_processed'] = data[text_var+'_processed'].apply(lambda x: text_preprocess(x))

  return data

In [9]:
# feature engineering of n-grams
def feature_engineering(data):
  features = []

  for i, j in Counter(create_corpus(data, "text_processed", "non-hash")).items():
    if j > 30:
      features.append(i)

  for i, j in Counter(create_corpus(data, "text_processed", "hash")).items():
    if j > 10:
      features.append(i)

  for i, j in get_top_tweet_ngrams(data["text_processed"], 2):
    if j > 10:
      features.append(i)

  for i, j in get_top_tweet_ngrams(data["text_processed"], 3):
    if j > 5:
      features.append(i)

  print(features)
  print(len(features))

  for i in features:
    data[i+"_ngram"] = np.where(data["text_processed"].str.contains(i),1,0)
  
  data.columns = data.columns.str.replace(' ', '_')
  
  return data

In [10]:
tweets_train_x = feature_engineering(preprocessing_fn(tweets_train, "text"))
tweets_train_x.head()

['report', 'suspect', 'custody', 'man', 'terrorist', 'still', 'boat', 'dzhokhar', 'tsarnaev', 'terror', 'president', 'street', 'watertown', 'boston', 'old', 'day', 'one', 'win', 'look', 'job', 'cambridge', 'bomb', 'attack', 'let', 'us', 'make', 'great', 'thing', 'like', 'bostonstrong', 'never', 'find', 'give', 'heart', 'get', 'time', 'want', 'dead', 'marathon', 'bombing', 'start', 'prayforboston', 'happen', 'prayer', 'go', 'family', 'come', 'tonight', 'back', 'good', 'first', 'place', 'new', 'run', 'event', 'even', 'police', 'everyone', 'today', 'city', 'home', 'life', 'tamerlan', 'via', 'pray', 'world', 'bostonmarathon', 'service', 'assistant', 'area', 'week', 'work', 'software', 'manager', 'two', 'see', 'team', 'help', 'w', 'friend', 'update', 'manhunt', 'think', 'would', 'l', 'celtic', 'ticket', 'snow', 'tomorrow', 'year', 'love', 'high', 'full', 'right', 'talk', 'use', 'house', 'know', 'victim', 'please', 'many', 'take', 'live', 'redsox', 'official', 'say', 'brother', 'game', 'engi

Unnamed: 0,text,label,text_processed,text_URL_present,text_retweet,text_punctuations_count,text_word_count,text_hashtag_count,report_ngram,suspect_ngram,custody_ngram,man_ngram,terrorist_ngram,still_ngram,boat_ngram,dzhokhar_ngram,tsarnaev_ngram,terror_ngram,president_ngram,street_ngram,watertown_ngram,boston_ngram,old_ngram,day_ngram,one_ngram,win_ngram,look_ngram,job_ngram,cambridge_ngram,bomb_ngram,attack_ngram,let_ngram,us_ngram,make_ngram,great_ngram,thing_ngram,like_ngram,bostonstrong_ngram,never_ngram,find_ngram,give_ngram,heart_ngram,get_ngram,time_ngram,want_ngram,dead_ngram,marathon_ngram,bombing_ngram,start_ngram,prayforboston_ngram,happen_ngram,prayer_ngram,go_ngram,family_ngram,come_ngram,tonight_ngram,back_ngram,good_ngram,first_ngram,place_ngram,new_ngram,run_ngram,event_ngram,even_ngram,police_ngram,everyone_ngram,today_ngram,city_ngram,home_ngram,life_ngram,tamerlan_ngram,via_ngram,pray_ngram,world_ngram,bostonmarathon_ngram,service_ngram,assistant_ngram,area_ngram,week_ngram,work_ngram,software_ngram,manager_ngram,two_ngram,see_ngram,team_ngram,help_ngram,w_ngram,friend_ngram,update_ngram,manhunt_ngram,think_ngram,would_ngram,l_ngram,celtic_ngram,ticket_ngram,snow_ngram,tomorrow_ngram,year_ngram,love_ngram,high_ngram,full_ngram,right_ngram,talk_ngram,use_ngram,house_ngram,know_ngram,victim_ngram,please_ngram,many_ngram,take_ngram,live_ngram,redsox_ngram,official_ngram,say_ngram,brother_ngram,game_ngram,engineer_ngram,news_ngram,scene_ngram,believe_ngram,end_ngram,fbi_ngram,hope_ngram,mass_ngram,senior_ngram,morning_ngram,tell_ngram,tragedy_ngram,stop_ngram,guy_ngram,school_ngram,open_ngram,feel_ngram,show_ngram,big_ngram,massachusetts_ngram,anyone_ngram,law_ngram,sale_ngram,bos_ngram,seek_ngram,next_ngram,capture_ngram,really_ngram,people_ngram,stay_ngram,could_ngram,last_ngram,hour_ngram,thank_ngram,community_ngram,bruin_ngram,st_ngram,college_ngram,may_ngram,student_ngram,photo_ngram,officer_ngram,way_ngram,rt_ngram,need_ngram,night_ngram,call_ngram,break_ngram,part_ngram,kill_ngram,arrest_ngram,watch_ngram,fire_ngram,monday_ngram,video_ngram,nba_ngram,celtics_ngram,support_ngram,red_ngram,state_ngram,search_ngram,hire_ngram,join_ngram,well_ngram,business_ngram,keep_ngram,scanner_ngram,hear_ngram,move_ngram,much_ngram,post_ngram,boston_job_ngram,dzhokhar_tsarnaev_ngram,job_boston_ngram,boston_marathon_ngram,bos_boston_ngram,boston_celtics_ngram,marathon_attack_ngram,bombing_suspect_ngram,boston_news_ngram,year_old_ngram,tamerlan_tsarnaev_ngram,boston_police_ngram,suspect_custody_ngram,suspect_dzhokhar_ngram,marathon_bombing_ngram,news_boston_ngram,boston_boston_ngram,cambridge_job_ngram,boston_prayforboston_ngram,let_us_ngram,part_time_ngram,terrorist_attack_ngram,boston_area_ngram,manager_boston_ngram,terror_attack_ngram,watertown_manhunt_ngram,come_boston_ngram,bostonmarathon_prayforboston_ngram,bomb_suspect_ngram,watertown_boston_ngram,manhunt_watertown_ngram,marathon_terror_ngram,law_enforcement_ngram,terror_bombing_ngram,red_sox_ngram,go_boston_ngram,boston_watertown_ngram,boston_bombing_ngram,boston_redsox_ngram,boston_common_ngram,full_time_ngram,bostonstrong_prayforboston_ngram,break_news_ngram,back_boston_ngram,prayforboston_bostonstrong_ngram,attack_boston_ngram,prayer_go_ngram,tsarnaev_cambridge_ngram,police_officer_ngram,thought_prayer_ngram,boston_college_ngram,state_police_ngram,day_boston_ngram,boston_bomb_ngram,boston_terror_ngram,software_engineer_ngram,city_boston_ngram,hunt_search_ngram,search_terror_ngram,terror_justice_ngram,justice_win_ngram,join_us_ngram,look_like_ngram,job_cambridge_ngram,prayforboston_bostonmarathon_ngram,tsarnaev_brother_ngram,boston_attack_ngram,last_night_ngram,boston_red_ngram,capture_hunt_ngram,dzhokar_tsarnaev_ngram,today_prayforboston_ngram,boston_bruins_ngram,boston_bruin_ngram,new_bedford_ngram,bombing_investigation_ngram,bostonmarathon_attack_ngram,win_suspect_ngram,prayforboston_prayfortexas_ngram,despite_report_ngram,report_contrary_ngram,contrary_arrest_ngram,arrest_marathon_ngram,scene_watertown_ngram,boston_get_ngram,boston_senior_ngram,boston_globe_ngram,part_boston_ngram,boat_watertown_ngram,nba_boston_ngram,live_boston_ngram,love_boston_ngram,people_boston_ngram,hour_full_ngram,tsarnaev_custody_ngram,street_watertown_ngram,heart_attack_ngram,watertown_suspect_ngram,job_senior_ngram,feel_like_ngram,source_tell_ngram,bruin_nhl_ngram,take_custody_ngram,custody_new_ngram,bedford_part_ngram,today_boston_ngram,boston_university_ngram,act_terror_ngram,suspect_identify_ngram,watertown_bostonstrong_ngram,boston_today_ngram,prayfortexas_prayforboston_ngram,boston_bostonstrong_ngram,time_boston_ngram,boston_tonight_ngram,cambridge_mass_ngram,people_take_ngram,heart_go_ngram,aero_bu_ngram,prayforboston_boston_ngram,happen_boston_ngram,boston_one_ngram,bostonstrong_watertown_ngram,south_boston_ngram,suspect_dzhokhar_tsarnaev_ngram,boston_marathon_terror_ngram,boston_marathon_bombing_ngram,dzhokhar_tsarnaev_cambridge_ngram,hunt_search_terror_ngram,search_terror_justice_ngram,terror_justice_win_ngram,marathon_terror_bombing_ngram,boston_red_sox_ngram,capture_hunt_search_ngram,boston_terror_attack_ngram,justice_win_suspect_ngram,win_suspect_custody_ngram,despite_report_contrary_ngram,report_contrary_arrest_ngram,contrary_arrest_marathon_ngram,arrest_marathon_attack_ngram,boston_bombing_suspect_ngram,boston_marathon_attack_ngram,take_custody_new_ngram,custody_new_bedford_ngram,new_bedford_part_ngram,bedford_part_boston_ngram,part_boston_marathon_ngram,terror_bombing_investigation_ngram,boston_bomb_suspect_ngram,hour_full_time_ngram,marathon_bombing_suspect_ngram,people_take_custody_ngram,bombing_suspect_dzhokhar_ngram,tsarnaev_cambridge_mass_ngram,identify_dzhokhar_tsarnaev_ngram,massachusetts_news_boston_ngram,manager_boston_job_ngram,dzhokhar_tsarnaev_custody_ngram,three_people_take_ngram,survive_boston_bomb_ngram,bomb_suspect_identify_ngram,suspect_identify_dzhokhar_ngram,watertown_bpd_scanner_ngram,full_time_salary_ngram,mit_police_officer_ngram,nba_boston_celtics_ngram,boston_bostonmarathon_prayforboston_ngram,job_boston_senior_ngram,boston_job_senior_ngram,burlington_wyle_seek_ngram,job_boston_aero_ngram,boston_aero_bu_ngram,boston_working_hour_ngram,working_hour_full_ngram,job_boston_sr_ngram,dzhokhar_tsarnaev_awake_ngram,tsarnaev_awake_respond_ngram,break_news_three_ngram,news_three_people_ngram,engineer_burlington_wyle_ngram,wyle_seek_experienced_ngram,suspect_tamerlan_tsarnaev_ngram,dzhokhar_tsarnaev_get_ngram,view_house_crazy_ngram,house_crazy_watertown_ngram,ap_survive_boston_ngram,cambridge_mass_ss_ngram,boston_one_community_ngram,one_community_let_ngram,community_let_terror_ngram,suspect_dzhokar_tsarnaev_ngram,attack_boston_marathon_ngram,work_hour_full_ngram,marathon_attack_victim_ngram,year_old_dzhokhar_ngram,old_dzhokhar_tsarnaev_ngram,bomb_suspect_dzhokhar_ngram
1,"RT @wilw NBC reporting suspect alive and in custody. CNN reporting that Quint, Sheriff Brody, unknown bearded man (poss terrorist?) still on boat.",1.0,nbc report suspect alive custody cnn report quint sheriff brody unknown bearded man poss terrorist still boat,0,1,7,21,0,1,1,1,1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,#BostonMararthon suspect Dzhokhar Tsarnaev is in custody http://t.co/ubNR5Lreqv http://t.co/ax7bDCIXRH,1.0,bostonmararthon suspect dzhokhar tsarnaev custody,1,0,1,7,1,0,1,1,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,RT @BreitbartNews Margaret Thatcher Remembered America in the Midst of Terror: President Barack Obama is fond of challenging Ame... http://t.co/3kdeg3ooAv,1.0,margaret thatcher remember america midst terror president barack obama fond challenge ame,1,1,4,17,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15,Spontaneous celebrations in streets of #Watertown #Boston. This is how my 91 yr old Dad describes V-E Day. One battle in War on Terror won.,1.0,spontaneous celebration street watertown boston yr old dad describe v e day one battle war terror win,0,0,6,24,2,0,0,0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
25,"#Bostonsgigs Looking for someone for dishwasher install (Charlestown, Ma) http://t.co/BJQpvI5MfQ #Boston",0.0,bostonsgig look someone dishwasher install charlestown boston,1,0,5,10,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
def baseline_model(data, text_var):
  data = feature_engineering(preprocessing_fn(data, text_var))
  


In [12]:
tweets_train_x[["text", "text_processed"]].head()

Unnamed: 0,text,text_processed
1,"RT @wilw NBC reporting suspect alive and in custody. CNN reporting that Quint, Sheriff Brody, unknown bearded man (poss terrorist?) still on boat.",nbc report suspect alive custody cnn report quint sheriff brody unknown bearded man poss terrorist still boat
2,#BostonMararthon suspect Dzhokhar Tsarnaev is in custody http://t.co/ubNR5Lreqv http://t.co/ax7bDCIXRH,bostonmararthon suspect dzhokhar tsarnaev custody
6,RT @BreitbartNews Margaret Thatcher Remembered America in the Midst of Terror: President Barack Obama is fond of challenging Ame... http://t.co/3kdeg3ooAv,margaret thatcher remember america midst terror president barack obama fond challenge ame
15,Spontaneous celebrations in streets of #Watertown #Boston. This is how my 91 yr old Dad describes V-E Day. One battle in War on Terror won.,spontaneous celebration street watertown boston yr old dad describe v e day one battle war terror win
25,"#Bostonsgigs Looking for someone for dishwasher install (Charlestown, Ma) http://t.co/BJQpvI5MfQ #Boston",bostonsgig look someone dishwasher install charlestown boston
