# Predict whether or not a tweet is about a real disaster 

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Data Overview

There are 7,613 training data points and 3,263 test data points.

In [2]:
def df_shape(filename):
    """
        Load a .csv file into a dataframe and report its dimensions
        
        @P: filename (string) path of the csv data set file
        @R: tuple (number of rows, number of columns) of the loaded dataframe
        
    """
    return pd.read_csv(filename).shape

def hist_rel_freq(filename,columnName,mx_val):
    """
        Save a relative frequency histogram of a specific column and print
        the relative frequency of each value in that column.
        
        The plot is written to 'output/dist_<columnName>.png'. The 'output'
        directory is created when it does not exist yet.
        
        @P: filename: (csv) the training data file
            columnName: (string) the name of the column of interest
            mx_val: (int) the max range for the x-axis
        @R: None
            
    """
    import os  # local import: only needed to guarantee the output folder exists

    df=pd.read_csv(filename)
    fig, ax = plt.subplots()
    sns.histplot(data=df, x=columnName, stat="percent", discrete=True, ax=ax)
    # Leave one unit of padding on each side and put a tick on every integer
    # from 0 to mx_val.
    ax.set_xlim(-1,mx_val+1)
    ax.set_xticks(range(0,mx_val+1))
    # savefig does not create missing directories, so make sure it is there.
    os.makedirs('output', exist_ok=True)
    fig.savefig('output/dist_{}.png'.format(columnName))
    print("\n Column: {} Relative Frequency \n".format(columnName),df[columnName].value_counts(normalize=True))

In [3]:
# Report the dimensions of each data set, then plot the class balance of the
# 'target' column of the training data.
train_shape = df_shape("input/train.csv")
print("The shape of the training data is {} ".format(train_shape))
test_shape = df_shape("input/test.csv")
print("The shape of the test data is {} ".format(test_shape))
hist_rel_freq("input/train.csv", "target", 1)

FileNotFoundError: [Errno 2] File b'input/train.csv' does not exist: b'input/train.csv'

# Split Training Data

In [None]:
# Load the labelled training data.
df=pd.read_csv("input/train.csv")

# Separate the features (every column except the label) from the label.
X=df.drop('target', axis=1)
y=df['target']
# Split X, not df: passing df would leave the 'target' label inside
# X_train / X_validate and leak the answer into the features.
# 30% of the rows are held out for validation; random_state fixes the shuffle.
X_train, X_validate, y_train, y_validate = train_test_split(X, y, test_size=0.3, random_state=42)

# Pre-process

• Convert all the words to lowercase

• Lemmatize all the words (i.e., convert every word to its root so that all of “running,” “run,” and “runs” are converted to “run” and all of “good,” “well,” “better,” and “best” are converted to “good”; this is easily done using nltk.stem)

• Strip punctuation

• Strip the stop words, e.g., “the”, “and”, “or”

• Strip @-mentions and URLs (it’s Twitter)

• Something else? Tell us about it

In [None]:
def make_lower(col_name,dataF):
    """
        Lowercase the text of one dataframe column, in place.
        
        @P: 
        col_name (string): Name of the column in the dataframe that contains text data
        dataF (dataframe): Dataframe whose column is overwritten with the lowercased text
        
        @R: None (dataF is modified directly)
        
    """
    lowered = dataF[col_name].str.lower()
    dataF[col_name] = lowered
    
def remove_punctuation(col_name,dataF):
    """
        Remove the punctuation in a column with text data, in place.
        
        Every run of characters that is neither a word character
        (letter, digit, underscore) nor whitespace is deleted.
        
        @P: 
        col_name (string): Name of column in a dataframe that contains text data
        dataF (dataframe): Dataframe with the text data that needs modifying
        
        @R: None (dataF is modified directly)
        
    """
    # regex=True is required: without it, newer pandas versions treat the
    # pattern as a literal string and no punctuation is removed.
    dataF[col_name]=dataF[col_name].str.replace(r'[^\w\s]+','',regex=True)

def lematize_txt(txt_col):
    """
        Tokenize a text string and reduce every token with the Porter stemmer.
        
        Despite the name, this applies stemming (PorterStemmer), not
        dictionary-based lemmatization.
        
        @P: txt_col (string): the text to tokenize and stem
        @R: string of the stemmed tokens joined by single spaces
        
    """
    stemmer = PorterStemmer()
    stemmed_tokens = map(stemmer.stem, word_tokenize(txt_col))
    return " ".join(stemmed_tokens)


In [None]:
# Normalise the tweet text in place: lowercase, strip punctuation, then stem.
make_lower("text",df)
remove_punctuation("text",df)
# Apply the stemmer to the 'text' Series directly; a row-wise
# DataFrame.apply(axis=1) builds a Series per row just to read one field.
df['text'] = df['text'].apply(lematize_txt)

df.head()