## Import necessary modules

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud,STOPWORDS  #Be VERY CAREFUL, it is "WordCloud" and not "wordcloud" after the import functioin
import re

In [None]:
# Load the competition's training and test sets from the Kaggle input directory.
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

### A quick look at the data

Example of a tweet that is NOT a disaster tweet.

In [None]:
# Preview the first few rows of the training data.
train_df.head()

In [None]:
# Second tweet (position 1) among the rows labelled target == 0.
train_df.loc[train_df["target"] == 0, "text"].values[1]

And one that is:

In [None]:
# Second tweet (position 1) among the rows labelled target == 1.
train_df.loc[train_df["target"] == 1, "text"].values[1]

In [None]:
# Work on a copy (`hope`) so the text cleaning below leaves train_df untouched.
hope = train_df.copy()

In [None]:
# (rows, columns) of the working copy.
hope.shape

#### REMOVE HYPERLINKS

In [None]:
# How many rows contain a hyperlink.
# The pattern is a regular expression: "https?://" matches both "http://" and
# "https://". (The earlier 'http?' made the final "p" optional, so it also
# counted any row that merely contained "htt".)
hope['text'].str.contains(r'https?://', regex=True).sum()

In [None]:
# Remove hyperlinks
# Matches the literal text "href", or "http" followed by one or more
# non-whitespace characters (i.e. a whole URL up to the next space).
# Compiled once here instead of on every call, since the function is applied
# to each row of the dataframe.
_URL_PATTERN = re.compile(r'href|http\S+')


def remove_http(review: str) -> str:
    """Return ``review`` with hyperlinks removed.

    Every occurrence of the literal "href" and every run of non-whitespace
    characters starting with "http" is replaced by the empty string. The
    whitespace around a removed URL is left in place.

    Args:
        review: The text to clean.

    Returns:
        The text with all matches deleted; unchanged if nothing matches.
    """
    return _URL_PATTERN.sub('', review)

In [None]:
# Strip hyperlinks from every tweet in the working copy.
hope['text'] = hope['text'].apply(remove_http)

In [None]:
# How many rows still contain a hyperlink.
# The pattern is a regular expression: "https?://" matches both "http://" and
# "https://". (The earlier 'http?' made the final "p" optional, so it also
# counted any row that merely contained "htt".)
hope['text'].str.contains(r'https?://', regex=True).sum()

## POPULAR WORDS

In [None]:
# Join all tweets into one space-separated string so the clean-up and
# stopword steps run once over the whole corpus instead of row by row.
total_text = ' '.join(hope['text'].tolist())

total_text[:1000]

In [None]:
# Replace every character that is not an ASCII letter (a-z or A-Z) with a
# space: digits, punctuation and other symbols all become blanks.
total_text = re.compile('[^a-zA-Z]').sub(' ', total_text)

total_text[:1000]

In [None]:
# Collapse each run of consecutive spaces into a single space.
total_text = re.compile(' +').sub(' ', total_text)

total_text[:1000]

#### REMOVE STOPWORDS

In [None]:
# Stopword set bundled with the wordcloud package; used below for the first
# word cloud and for splitting the tokens.
stop_words2 = set(STOPWORDS)

# NLTK's English stopword list; not referenced again in the cells visible here.
stop_words = set(stopwords.words('english'))

In [None]:
### Find popular words i.e words that occur regularly in hope['text']

# Build a 1000x500 word cloud from the cleaned corpus, ignoring stop_words2.
wordcloud = WordCloud(width=1000, height=500, stopwords=stop_words2).generate(total_text)

# Render the cloud as an image on a large figure.
plt.figure(figsize=(20,10))
plt.imshow(wordcloud)
plt.axis('off') #to remove the axis number from showing

In [None]:
# Split the cleaned corpus string into a list of word tokens.
word_tokens = word_tokenize(total_text)

In [None]:
# Inspect the first ten tokens.
word_tokens[0:10]

In [None]:
# Partition the tokens by membership in stop_words2:
#   hello             - a few hand-picked seed words followed by every token
#                       that is a stopword (one entry per occurrence)
#   filtered_sentence - every remaining token, in original order
hello = ['amp','don','re','via','st'] + [tok for tok in word_tokens if tok in stop_words2]
filtered_sentence = [tok for tok in word_tokens if tok not in stop_words2]

In [None]:
# Positional ids 0..n-1, one per retained token; used as the "id" column of
# the dataframe built next.
bark = list(range(len(filtered_sentence)))
len(bark)

In [None]:
# Tabulate the retained tokens next to their positional ids.
data = dict(id=bark, filtered_sentence=filtered_sentence)
df = pd.DataFrame(data)
df

In [None]:
# Number of occurrences of each distinct token, most frequent first
# (descending order is the value_counts default).
word_count = df['filtered_sentence'].value_counts()
# Show positions 1 through 19, i.e. everything after the single top entry.
word_count.iloc[1:20]

In [None]:
# Snapshot of the distinct words (the index of word_count) as a plain list.
key_list = word_count.index.tolist()

In [None]:
# Extra stopwords: every single ASCII letter (lower and upper case) plus the
# stopword tokens collected in `hello`.
# chr(97)..chr(122) is 'a'..'z' and chr(65)..chr(90) is 'A'..'Z'. range()
# excludes its end value, so the upper-case range must end at 91 to include
# 'Z' (the previous end of 90 stopped at 'Y').
stop_words3 = list(map(chr, range(97, 123))) + list(map(chr, range(65, 91))) + hello
stop_words3[100:110]

In [None]:
# Remove redundant words that are not informative: the very frequent ones
# (which also become stopwords) and the very rare ones.
MAX_WORD_COUNT = 184  # words occurring more often than this are treated as stopwords
MIN_WORD_COUNT = 5    # words occurring fewer times than this are discarded

too_common = word_count > MAX_WORD_COUNT
too_rare = word_count < MIN_WORD_COUNT

# Very frequent words join the stopword list, in their frequency order.
stop_words3.extend(word_count[too_common].index.tolist())

# Keep only words with MIN_WORD_COUNT <= count <= MAX_WORD_COUNT. Boolean
# masks do this in one pass; removing entries one at a time with Series.pop()
# re-builds the Series on every call.
word_count = word_count[~(too_common | too_rare)]

In [None]:
# Words that survived the frequency pruning, as a plain list.
key_list2 = word_count.index.tolist()
key_list2[1:20]

In [None]:
### Find popular words i.e words that occur regularly in key_list2
# Join the remaining words into one space-separated string.
# NOTE(review): key_list2 comes from the index of a value_counts() result, so
# each word appears exactly once in total_text2 and the original counts are
# not carried into the cloud - presumably all words end up with similar
# weight. If sizes should reflect frequency, feeding word_count to the cloud
# directly may be what is wanted - TODO confirm.
total_text2 = (' '.join(key_list2))

# Build the second cloud, this time ignoring the extended stop_words3 list.
wordcloud2 = WordCloud(width=1000, height=500, stopwords=stop_words3).generate(total_text2)

# Render the cloud as an image on a large figure.
plt.figure(figsize=(20,10))
plt.imshow(wordcloud2)
plt.axis('off') #to remove the axis number from showing

### VECTORIZE THE COLUMN

In [None]:
# TF-IDF vectorizer (despite the variable name, this is not a plain count
# vectorizer) that ignores the words in stop_words3.
count_vectorizer = feature_extraction.text.TfidfVectorizer(stop_words = stop_words3)
# Trial fit on the first five tweets only; the vectorizer is fitted again on
# the full text column when train_vectors is created.
example_train_vectors = count_vectorizer.fit_transform(hope['text'][0:5])

In [None]:
# Fit the vectorizer on every cleaned training tweet and produce the TF-IDF
# matrix used for training. This replaces the five-row trial fit.
train_vectors = count_vectorizer.fit_transform(hope['text'])

In [None]:
## note that I'm NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors - 
# i.e. that the train and test vectors use the same set of tokens.
# The test tweets get the same hyperlink removal as the training text, so
# both sets reach the vectorizer in the same form. test_df itself is not
# modified; the cleaning is applied to a temporary Series.
test_vectors = count_vectorizer.transform(test_df["text"].apply(remove_http))

### The model

In [None]:
## The vectors are really big, so we want to push the model's weights
## toward 0 without completely discounting different words - ridge regression 
## is a good way to do this.
# Ridge classifier created with its default settings.
clf = linear_model.RidgeClassifier()

##### TEST THE MODEL

In [None]:
# 3-fold cross-validated F1 score of the classifier on the training vectors.
# NOTE(review): train_vectors was produced by fitting the vectorizer on the
# whole training set, so within each fold the vocabulary and IDF weights have
# already seen the held-out rows; the scores may be slightly optimistic.
scores = model_selection.cross_val_score(clf, train_vectors, train_df["target"], cv=3, scoring="f1")
scores

##### FITTING THE MODEL

In [None]:
# Train the final model on all training vectors and their labels.
clf.fit(train_vectors, train_df["target"])

In [None]:
# Load the submission template provided with the competition data.
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

In [None]:
# Fill the template's "target" column with the model's predictions for the test set.
sample_submission["target"] = clf.predict(test_vectors)

In [None]:
# Preview the first few rows of the filled-in submission.
sample_submission.head()

In [None]:
# Write the submission file without the dataframe's row index.
sample_submission.to_csv("submission.csv", index=False)