# **My First Notebook of NLP**

This notebook follows the [NLP Getting Started Tutorial](https://www.kaggle.com/philculliton/nlp-getting-started-tutorial) and the Kaggle [Natural Language Processing course](https://www.kaggle.com/learn/natural-language-processing).






In [None]:
#import library

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from functools import partial
import optuna
from xgboost import XGBRegressor


from sklearn import feature_extraction,linear_model,model_selection,preprocessing

from sklearn.metrics import mean_squared_error,roc_auc_score,precision_score,accuracy_score,log_loss


In [None]:
# Load the competition's labelled training split and the unlabelled test split.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/nlp-getting-started/train.csv",
        "../input/nlp-getting-started/test.csv",
    )
)

# **Explore the Data**

In [None]:
# Summary statistics of the training frame.
df_train.describe()

In [None]:
# Class balance: share of tweets labelled disaster vs. not disaster.

# The style has to be configured *before* the axes are created; applying it
# after plotting leaves this figure unchanged. seaborn's own setter is used
# because the "seaborn-whitegrid" matplotlib style name is deprecated in
# matplotlib 3.6+.
sns.set_style("whitegrid")

ax = sns.countplot(x='target', data=df_train)
# Pin the tick positions before relabelling so each label stays on its bar.
ax.set_xticks([0, 1])
ax.set_xticklabels(['0: Not Disaster', '1: Disaster'], ha="center")
ax.set_title("Disaster or Not")

total = len(df_train.target)
for p in ax.patches:
    # The trailing newline pushes the centred text up so it sits above the bar.
    percentage = f'{100 * p.get_height() / total:.1f}%\n'
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    ax.annotate(percentage, (x, y), ha='center', va='center')

In [None]:
# Non-disaster examples: the last 100 training rows with target == 0.
df_train[df_train['target'] ==0].tail(100)

In [None]:
# Disaster examples: the first 100 training rows with target == 1.
df_train[df_train['target']==1].head(100)
    

# **New Feature Creation**
I did not have any good ideas for engineering new features from this data, but I found a very helpful notebook to refer to: [NLP with Disaster Tweets - EDA, Cleaning and BERT](https://www.kaggle.com/gunesevitan/nlp-with-disaster-tweets-eda-cleaning-and-bert).
I am going to use some ideas from that notebook.

In [None]:
# Hand-crafted "meta" features computed from the raw tweet text.
from string import punctuation

from nltk.corpus import stopwords

stopwords_en = set(stopwords.words('english'))
punctuations = set(punctuation)


def _mean_word_length(text):
    """Return the average length of the whitespace-separated tokens in ``text``.

    Returns 0.0 when ``text`` has no tokens. ``np.mean([])`` would give NaN
    (plus a RuntimeWarning), and a single NaN would propagate through the
    later scaling step into model training.
    """
    words = str(text).split()
    if not words:
        return 0.0
    return float(np.mean([len(w) for w in words]))


def _add_text_meta_features(df):
    """Add the six text-statistic columns to ``df`` in place, from ``df['text']``.

    Columns added: word_count, unique_word_count, stop_word_count,
    mean_word_length, char_count, punctuation_count.
    """
    text = df['text']
    df['word_count'] = text.apply(lambda x: len(str(x).split()))
    df['unique_word_count'] = text.apply(lambda x: len(set(str(x).split())))
    # Stop words are matched case-insensitively against the NLTK English list.
    df['stop_word_count'] = text.apply(
        lambda x: sum(w in stopwords_en for w in str(x).lower().split()))
    df['mean_word_length'] = text.apply(_mean_word_length)
    df['char_count'] = text.apply(lambda x: len(str(x)))
    df['punctuation_count'] = text.apply(
        lambda x: sum(c in punctuations for c in str(x)))


# One shared routine for both splits keeps the train and test columns in sync.
for _frame in (df_train, df_test):
    _add_text_meta_features(_frame)

In [None]:
# Names of the engineered text-statistic columns, in the order they are created.
new_features = [
    'word_count',
    'unique_word_count',
    'stop_word_count',
    'mean_word_length',
    'char_count',
    'punctuation_count',
]

In [None]:
from sklearn import preprocessing

# Learn the min-max ranges from the training features only, then rescale both
# splits with those same ranges.
scaler = preprocessing.MinMaxScaler()
scaler.fit(df_train[new_features])
for frame in (df_train, df_test):
    frame[new_features] = scaler.transform(frame[new_features])

In [None]:
# How many distinct keyword values exist, relative to the number of tweets.
unique_keywords = df_train.keyword.unique()
print(f"keywords:{len(unique_keywords)}")
print(f"Total tweets:{len(df_train)}\n")

print(unique_keywords)


In [None]:
# How many distinct location values exist, relative to the number of tweets.
# The label previously read "keywords:", a copy-paste slip from the cell above.
unique_locations = df_train.location.unique()
print(f"locations:{len(unique_locations)}")
print(f"Total tweets:{len(df_train)}\n")

print(unique_locations)


**Using keywords to make a new feature**

The location column has too many unique values to be useful as a feature, unlike the keyword column.
Let's see which keywords are useful for predicting whether a tweet is about a disaster or not.

In [None]:
# Training rows that have a keyword. The explicit copy makes this an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning or risk writing through to df_train.
df_train_withkeywords = df_train.dropna(subset=['keyword'], axis=0).copy()

In [None]:
# Preview the first five rows that have a keyword.
df_train_withkeywords.head(5)


In [None]:
# Per-keyword mean of the target, broadcast back onto every row of that keyword.
keyword_targets = df_train_withkeywords.groupby('keyword')['target']
df_train_withkeywords['target_mean'] = keyword_targets.transform('mean')

In [None]:
# Display the keyword frame, now including the target_mean column.
df_train_withkeywords

In [None]:
# Adapted from Basic NLP with NLTK (https://www.kaggle.com/alvations/basic-nlp-with-nltk/notebook),
# which is referenced further down in this notebook.
#
# Per-keyword tweet counts split by target, with keywords ordered from the
# highest to the lowest mean target.
by_target_mean = df_train_withkeywords.sort_values(by='target_mean', ascending=False)

fig = plt.figure(figsize=(12, 72), dpi=100)

sns.countplot(y=by_target_mean['keyword'], hue=by_target_mean['target'])

for axis_name, label_size in (('x', 15), ('y', 12)):
    plt.tick_params(axis=axis_name, labelsize=label_size)
plt.legend(loc=1)
plt.title('Target Distribution in Keywords')

plt.show()



In [None]:
def _distinct_values(values):
    """Return the distinct entries of ``values`` as a plain list."""
    return list(np.unique(values))


# keyword -> list of the distinct target_mean values found in that keyword's rows.
g = df_train_withkeywords.groupby(by='keyword')['target_mean'].apply(_distinct_values)


In [None]:
# Split keywords by the first target_mean value stored for them in `g`:
# above 0.7 goes to the disaster list, below 0.3 to the non-disaster list.
# Keywords in between are left out of both lists.
all_keywords = df_train_withkeywords.keyword.unique()
disaster_keywords = [kw for kw in all_keywords if g[kw][0] > 0.7]
non_disaster_keywords = [kw for kw in all_keywords if g[kw][0] < 0.3]
        

In [None]:
# Binary flags marking tweets whose keyword is on one of the informative lists.


def disaster(keyword):
    """Return 1 if ``keyword`` is in ``disaster_keywords``, otherwise 0."""
    return 1 if keyword in disaster_keywords else 0


def nondisaster(keyword):
    """Return 1 if ``keyword`` is in ``non_disaster_keywords``, otherwise 0."""
    return 1 if keyword in non_disaster_keywords else 0


# Apply both flags to the train and the test split alike.
for frame in (df_train, df_test):
    frame['is_disaster_keyword'] = frame['keyword'].apply(disaster)
    frame['is_non_disaster_keyword'] = frame['keyword'].apply(nondisaster)


# **Text Cleaning**

Based on [Basic NLP with NLTK](https://www.kaggle.com/alvations/basic-nlp-with-nltk/notebook).

In [None]:
#example

#from nltk import sent_tokenize, word_tokenize
#text = df_train.text[0]
#for sent in sent_tokenize(text):
#    print([word.lower() for word in word_tokenize(sent)])

In [None]:
#text0_list = list(map(str.lower, word_tokenize(text)))

#remove stopwords(non-content words)

#from nltk.corpus import stopwords
#stopwords_en=set(stopwords.words('english'))
#remove punctuations

#from string import punctuation
#stopwords_en_withpunct = stopwords_en.union(set(punctuation))


#text0_list1=[word for word in text0_list if word not in stopwords_en_withpunct]
#print(text0_list1)

In [None]:
#from nltk.stem import PorterStemmer
#porter=PorterStemmer()
#for word in text0_list1:
#    print(porter.stem(word))

In [None]:
#def preprocessing(text):
#    li1 = [word.lower() for word in text.split()]
#    li2 =  [word for word in li1 if word not in stopwords_en_withpunct]
#    li3= [porter.stem(word) for word in li2]
#    return ' '.join(li3)

In [None]:
#df_train.text[2]

In [None]:
#preprocessing(df_train.text[2])

In [None]:
#df_train['cleaned_text'] = df_train['text'].apply(preprocessing)
#df_test['cleaned_text'] =df_test['text'].apply(preprocessing)

In [None]:
#df_train.head(100)

# **Building Vectors**

Use scikit-learn's `CountVectorizer` to count the words in each tweet and turn them into data that a machine learning model can process.

In [None]:
# Demo: fit the vectorizer on only the first five tweets to inspect its output.
count_vectorizer = feature_extraction.text.CountVectorizer()

first_five_tweets = df_train['text'].iloc[:5]
example_train_vectors = count_vectorizer.fit_transform(first_five_tweets)

In [None]:
# Print the vectorized first tweet (row 0 of the demo matrix).
print(example_train_vectors[0])

In [None]:
# Dense view of the first tweet's vector: its shape first, then the counts.
first_tweet_dense = example_train_vectors[0].todense()
print(first_tweet_dense.shape)
print(first_tweet_dense)

In [None]:
# Learn the vocabulary from the training tweets, then encode the test tweets
# with that same vocabulary (transform only, no refit).
train_texts, test_texts = df_train["text"], df_test['text']
train_vectors = count_vectorizer.fit_transform(train_texts)
test_vectors = count_vectorizer.transform(test_texts)

In [None]:
# Turn the sparse count matrices into DataFrames, one column per vocabulary term.

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back only on old releases that do not have get_feature_names_out().
if hasattr(count_vectorizer, "get_feature_names_out"):
    vocab_columns = list(count_vectorizer.get_feature_names_out())
else:
    vocab_columns = count_vectorizer.get_feature_names()

# toarray() gives a plain ndarray instead of the legacy np.matrix that
# todense() returns.
vect_train_df = pd.DataFrame(train_vectors.toarray(), columns=vocab_columns)
vect_test_df = pd.DataFrame(test_vectors.toarray(), columns=vocab_columns)



In [None]:
# Copy the two keyword flags onto the vectorized frames. Note that the
# destination column is spelled "is_nondisaster_keyword" while the source
# column is "is_non_disaster_keyword".
flag_columns = (
    ('is_disaster_keyword', 'is_disaster_keyword'),
    ('is_nondisaster_keyword', 'is_non_disaster_keyword'),
)
for vect_df, source_df in ((vect_train_df, df_train), (vect_test_df, df_test)):
    for dest_col, src_col in flag_columns:
        vect_df[dest_col] = source_df[src_col]

In [None]:
# Append the scaled text-statistic features to each vectorized frame.
vect_train_df = vect_train_df.join(df_train[new_features])
# The test frame must receive the *test* features. Joining df_train here
# attached training rows' statistics to test tweets by index, with no error.
vect_test_df = vect_test_df.join(df_test[new_features])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for validation; the fixed random_state
# keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    vect_train_df,
    df_train.target,
    test_size=0.3,
    random_state=1,
)

# **Create Model**

In [None]:
# (rows, feature columns) of the training split fed to the model.
X_train.shape

In [None]:
from tensorflow import keras
from tensorflow.keras import layers, callbacks

# Halt training when improvement stalls and roll back to the best weights seen.
early_stopping = callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=10,      # epochs to wait without improvement before stopping
    min_delta=0.001,  # minimum change that counts as an improvement
)

# Small feed-forward classifier with dropout between the dense layers and a
# two-way softmax output (one probability per class).
model = keras.Sequential([
    layers.Flatten(),
    layers.Dropout(0.2),
    # Optional wider first block, currently disabled:
    # layers.Dense(300, activation='relu'),
    # layers.Dropout(0.5),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.6),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.4),
    layers.Dense(2, activation="softmax"),
])



In [None]:
# The loss is configured with from_logits=False, i.e. it expects probabilities
# rather than raw logits.
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
model.compile(
    optimizer='adam',
    loss=loss_fn,
    metrics=['accuracy'],
)

In [None]:
# Convert every split to a plain NumPy array before handing it to Keras.
X_train, y_train, X_test, y_test = [
    np.asarray(part) for part in (X_train, y_train, X_test, y_test)
]

vect_test = np.asarray(vect_test_df)

In [None]:
# Train with a large epoch cap; the early-stopping callback decides when to
# actually halt.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=1000,
    batch_size=512,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
)

In [None]:
# Sanity check: the model should return one prediction per validation row.
# Both lengths are shown as a tuple. As two separate statements, only the
# last value was displayed and the first was computed and thrown away.
len(model.predict(X_test)), len(X_test)

In [None]:
# Validation metrics.
# One row of class scores per sample; the predicted label is the argmax.
y_proba = model.predict(X_test)
y_pred = y_proba.argmax(axis=1)

# RMSE is computed via np.sqrt because the `squared=False` argument of
# mean_squared_error is deprecated and removed in recent scikit-learn releases.
print(np.sqrt(mean_squared_error(y_test, y_pred)))
# ROC AUC is a ranking metric, so it needs a continuous score (the class-1
# column), not thresholded 0/1 labels.
print(roc_auc_score(y_test, y_proba[:, 1]))


# Values recorded from an earlier run (RMSE, then AUC computed on hard labels;
# the AUC printed above is therefore not directly comparable):
#0.4333917065178553
#0.7976782008772676


In [None]:
# Load the competition's sample submission file; its "target" column is filled in below.
sample_submission = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")

In [None]:
# Predict labels for the test set. The result goes into a new name:
# overwriting `vect_test` with the predictions meant that re-running this cell
# passed labels, not features, back into the model.
test_pred = model.predict(vect_test).argmax(axis=1)
sample_submission["target"] = test_pred

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
sample_submission.to_csv("submission.csv",index=False)