In [1]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [2]:
df_train= pd.read_csv('../input/nlp-getting-started/train.csv')
df_test=pd.read_csv('../input/nlp-getting-started/test.csv')

In [3]:
df_test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [4]:
# Use train_test_split to split training data into training and validation sets
train_sentences =df_train["text"].to_numpy()
train_labels = df_train["target"].to_numpy()
# Create tokenization and modelling pipeline
model_0 = Pipeline([
                    ("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
                    ("clf", MultinomialNB()) # model the text
])

In [5]:
test_sentences = df_test["text"].to_numpy()


In [6]:
def clean_dataset(value):
  dataset=[]
  emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
  for i in value:
    text=re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", i)
    text=emoji_pattern.sub(r'', text)
    text=text.lower()
    dataset.append(text)
  return dataset

In [7]:
cleaned_train_data=clean_dataset(train_sentences)

In [8]:
model_0.fit(cleaned_train_data,train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [9]:
cleaned_test_data=clean_dataset(test_sentences)

In [10]:
baseline = model_0.predict(cleaned_test_data)
baseline

array([1, 0, 1, ..., 1, 1, 1])

In [11]:
train_prediction = model_0.predict(cleaned_train_data)
train_prediction

array([1, 1, 1, ..., 1, 1, 1])

In [12]:
from sklearn.metrics import accuracy_score

In [13]:
accuracy_score(train_prediction,train_labels)

0.8838828319978983

In [14]:
test_id= df_test["id"].to_numpy()

In [15]:
submission = pd.DataFrame({'id':test_id, 'target':baseline.flatten()})
submission.to_csv("submission.csv", index=False)