<a href="https://www.kaggle.com/code/priyanagda/logistic-regression?scriptVersionId=108837770" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np
import pandas as pd
import os
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# All three competition CSVs live in the same input directory.
data_dir = "../input/nlp-getting-started"
train_path = f"{data_dir}/train.csv"
test_path = f"{data_dir}/test.csv"
sub_path = f"{data_dir}/sample_submission.csv"

In [None]:
# Read the train, test and sample-submission CSVs into DataFrames.
train_df, test_df, sub_df = map(pd.read_csv, (train_path, test_path, sub_path))

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
# Remove the id, keyword and location columns from both frames, in place.
for frame in (train_df, test_df):
    frame.drop(columns=["id", "keyword", "location"], inplace=True)

In [None]:
train_df.head()

In [None]:
# Lowercase every entry of the `text` column.
train_df.loc[:, 'text'] = train_df['text'].map(str.lower)

In [None]:
train_df.head()

In [None]:
# Strip hyperlinks, text starting at 'û', digits and '#' signs from the tweets.
# NOTE: the 'http' and 'û' patterns delete everything from the match to the
# end of the line (`.` does not match a newline), not only the link/character.
# The patterns are applied in this order; each match is replaced with ''.
cleanup_patterns = (
    r'http.*',  # was 'http.*.*': the second '.*' could only match the empty string
    r'û.*',     # was 'û.*.*': same redundancy
    r'\d+',
    r'#+',      # was '#*', which also matched the empty string at every position
)
for pattern in cleanup_patterns:
    train_df['text'] = train_df['text'].str.replace(pattern, '', regex=True)

In [None]:
train_df.head()

In [None]:
# Tokenise each tweet into runs of word characters (the r'\w+' pattern) and
# store the resulting token lists in a new `tokens` column.
tokenizer = RegexpTokenizer(r'\w+')
train_df['tokens'] = train_df['text'].apply(tokenizer.tokenize)

In [None]:
train_df.head()

In [None]:
# Remove stopwords from every token list.
# The stopword list is materialised once as a set: the previous version called
# stopwords.words() again for every single token and then scanned the
# returned list linearly, which made this cell extremely slow.
# NOTE(review): words() is called without a language, which appears to pull
# the stopwords of every language shipped in the corpus, not only English;
# confirm this is intended (behaviour is left unchanged here).
stop_words = set(stopwords.words())
train_df['tokens'] = train_df['tokens'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [None]:
train_df.head()

In [None]:
# Reduce every token to its Porter stem, then collapse each token list back
# into a single space-separated string.
ps = PorterStemmer()
train_df['tokens'] = train_df['tokens'].apply(lambda tokens: ' '.join(map(ps.stem, tokens)))

In [None]:
train_df.head()

In [None]:
# train_df.to_csv('./processed_train.csv', index=False)

In [None]:
# Preprocess the test data with the same steps used for the training data:
# lowercase, regex clean-up, tokenise, drop stopwords, stem and re-join.
test_df.loc[:, 'text'] = test_df['text'].map(str.lower)

# NOTE: the 'http' and 'û' patterns delete everything from the match to the
# end of the line, not only the link/character. 'http.*' / 'û.*' / '#+' give
# exactly the same output as the former 'http.*.*' / 'û.*.*' / '#*' without
# the redundant '.*' and without matching the empty string at every position.
for pattern in (r'http.*', r'û.*', r'\d+', r'#+'):
    test_df['text'] = test_df['text'].str.replace(pattern, '', regex=True)

test_df['tokens'] = test_df['text'].map(tokenizer.tokenize)

# Build the stopword set once instead of calling stopwords.words() per token.
# NOTE(review): words() without a language appears to cover every language in
# the corpus; confirm this is intended (behaviour is left unchanged here).
stop_words = set(stopwords.words())
test_df['tokens'] = test_df['tokens'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)
test_df['tokens'] = test_df['tokens'].apply(
    lambda tokens: ' '.join(ps.stem(token) for token in tokens)
)

In [None]:
test_df.head()

In [None]:
# test_df.to_csv('./processed_test.csv', index=False)

In [None]:
# train_df = pd.read_csv('./processed_train.csv')
# test_df = pd.read_csv('./processed_test.csv')

In [None]:
# Concatenate the processed token strings of all training rows into one string.
corpus = ' '.join(train_df['tokens'])

In [None]:
# The vocabulary is the collection of distinct whitespace-separated tokens.
unique_tokens = set(corpus.split())
vocab = list(unique_tokens)

In [None]:
len(vocab)

In [None]:
# Join the token strings of every complete row whose target is 1.
is_disaster = train_df.target == 1
disaster_tweets = ' '.join(train_df[is_disaster].dropna().tokens)

In [None]:
# Join the token strings of every complete row whose target is 0.
is_not_disaster = train_df.target == 0
no_disaster_tweets = ' '.join(train_df[is_not_disaster].dropna().tokens)

In [None]:
# Put the vocabulary into ascending string order.
vocab.sort()

In [None]:
# Per-class word counts, with every vocabulary word starting at zero.
disaster_freq = dict.fromkeys(vocab, 0)
no_disaster_freq = dict.fromkeys(vocab, 0)

In [None]:
# Tally how often each word occurs across the target == 1 tweets.
disaster_words = disaster_tweets.split()
for token in disaster_words:
    disaster_freq[token] += 1

In [None]:
# Tally how often each word occurs across the target == 0 tweets.
no_disaster_words = no_disaster_tweets.split()
for token in no_disaster_words:
    no_disaster_freq[token] += 1

In [None]:
# Allocate the training feature matrix: one row per training tweet and two
# columns, all initialised to zero.
n_train_rows = len(train_df)
X = np.zeros(shape=(n_train_rows, 2))

In [None]:
# For each training tweet, add up the class-wise counts of its words:
# column 0 accumulates the disaster counts, column 1 the non-disaster counts.
for row, sentence in enumerate(train_df.tokens):
    for token in sentence.split():
        X[row, 0] += disaster_freq[token]
        X[row, 1] += no_disaster_freq[token]

In [None]:
# Build the same two-column feature matrix for the test tweets.
# Words missing from the frequency tables contribute 0 via dict.get.
n_test_rows = len(test_df)
X_test = np.zeros(shape=(n_test_rows, 2))
for row, sentence in enumerate(test_df.tokens):
    for token in sentence.split():
        X_test[row, 0] += disaster_freq.get(token, 0)
        X_test[row, 1] += no_disaster_freq.get(token, 0)

In [None]:
# Training labels as a standalone NumPy array (copied out of the frame).
Y = train_df['target'].to_numpy(copy=True)

In [None]:
# Fit a logistic-regression classifier on the two frequency features.
model = LogisticRegression(random_state=42)
clf = model.fit(X, Y)

In [None]:
# Predict a label for every row of the test feature matrix.
Y_test = clf.predict(X_test)

In [None]:
list(Y_test)[:5]

In [None]:
test_df.head()

In [None]:
# Overwrite the submission template's target column with the predictions.
sub_df['target'] = Y_test

In [None]:
# Write the submission frame to submission.csv, omitting the index column.
sub_df.to_csv('submission.csv', index=False)

In [None]:
# Render a word cloud (at most 50 words) from the disaster-tweet text.
cloud_options = dict(max_font_size=90, max_words=50, background_color="white", colormap="inferno")
wordcloud = WordCloud(**cloud_options).generate(disaster_tweets)

plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title('\nFrequntly occuring words related to Disaster \n\n', fontsize=18)
plt.axis("off")  # hide the axes; only the image and title are shown
plt.show()

In [None]:
# Render a word cloud (at most 50 words) from the non-disaster-tweet text.
wordcloud = WordCloud(max_font_size=90, max_words=50, background_color="white", colormap="inferno").generate(no_disaster_tweets)
plt.figure(figsize=(10,10))
plt.imshow(wordcloud, interpolation="bilinear")
# The title previously read "related to Disaster" (copied from the disaster
# plot), which mislabelled this figure; it is built from no_disaster_tweets.
plt.title('\nFrequntly occuring words not related to Disaster \n\n',fontsize=18)
plt.axis("off")
plt.show()

In [None]:
# Possible improvement: give the most frequent words a little extra weight,
# e.g. add +10 or +5 for each of the top 'k' words.
