<h1>Explore Spam Ham Tweets</h1>


# Introduction

We will analyse the data distribution for this Spam/Ham tweets dataset.


# Data preparation

## Load packages

In [None]:
import numpy as np 
import pandas as pd
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
from wordcloud import WordCloud, STOPWORDS
import warnings
warnings.simplefilter("ignore")

## Load data

In [None]:
# Read the raw CSV from the Kaggle input directory into a DataFrame.
# NOTE(review): latin1 is presumably needed because the file is not valid UTF-8 — confirm.
tweets_df = pd.read_csv("/kaggle/input/spamham/spam.csv", encoding="latin1")

# Data exploration


## Glimpse the data

In [None]:
# Report the (rows, columns) shape of the freshly loaded frame.
print(f"data shape: {tweets_df.shape}")

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
tweets_df.info()

In [None]:
# Descriptive summary of the raw columns.
tweets_df.describe()

In [None]:
# Preview the first rows with the original column headers.
tweets_df.head()

In [None]:
# Replace the original headers with short working names. The assignment requires
# the frame to have exactly five columns; `c3`, `c4`, `c5` name the extra columns
# that are inspected and merged into `text` further down.
tweets_df.columns = ["class", "text", "c3", "c4", "c5"]

### Fix issue with wrong columns

The wrongly created columns (`c3`, `c4`, `c5`) will be merged into the corresponding `text` column values.

In [None]:
# Preview the first rows again, now with the renamed columns.
tweets_df.head()

In [None]:
# Frequency of each distinct non-missing value found in the extra column `c3`.
tweets_df.c3.value_counts()

In [None]:
# Frequency of each distinct non-missing value found in the extra column `c4`.
tweets_df.c4.value_counts()

In [None]:
# Frequency of each distinct non-missing value found in the extra column `c5`.
tweets_df.c5.value_counts()

Merging the text.

In [None]:
# Append the extra columns to `text`. Missing values are replaced by empty strings
# first: calling str() on NaN would otherwise glue a literal "nan" onto the text
# for every empty extra column (producing a spurious "nannannan" suffix), and a
# missing `text` value would raise a TypeError on concatenation.
tweets_df['text'] = (
    tweets_df[['text', 'c3', 'c4', 'c5']]
    .fillna("")
    .astype(str)
    .apply("".join, axis=1)
)

Drop the three columns now.

In [None]:
# Keep only the label and the merged text; `c3`, `c4` and `c5` are dropped by
# not being selected.
tweets_df = tweets_df[["class", "text"]]

In [None]:
# Preview the two-column frame after merging and dropping the extra columns.
tweets_df.head()

### Missing data

In [None]:
def missing_data(data):
    """Summarise missing values per column.

    Returns a transposed frame with one column per column of `data` and three
    rows: 'Total' (number of missing values), 'Percent' (missing values as a
    percentage of all rows) and 'Types' (the column dtype as a string).
    """
    null_mask = data.isnull()
    n_missing = null_mask.sum()
    # count() on the boolean mask is the number of rows, since the mask has no NaN.
    pct_missing = n_missing / null_mask.count() * 100
    summary = pd.concat([n_missing, pct_missing], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[col].dtype) for col in data.columns]
    return summary.T

In [None]:
# Missing-value summary for the cleaned two-column frame.
missing_data(tweets_df)

### Unique values

In [None]:
def unique_values(data):
    """Summarise cardinality per column.

    Returns a transposed frame with one column per column of `data` and two
    rows: 'Total' (number of non-missing values) and 'Uniques' (number of
    distinct non-missing values).
    """
    summary = data.count().to_frame('Total')
    summary['Uniques'] = [data[col].nunique() for col in data.columns]
    return summary.T

In [None]:
# Non-missing and distinct value counts for the cleaned frame.
unique_values(tweets_df)

### Most frequent values

In [None]:
def most_frequent_values(data):
    """Report the most frequent value of every column.

    Returns a transposed frame with one column per column of `data` and four
    rows: 'Total' (number of non-missing values), 'Most frequent item',
    'Frequence' (how often that item occurs) and 'Percent from total'
    (frequency as a percentage of 'Total', rounded to 3 decimals).

    A column without any non-missing value is reported with a NaN item, a
    frequency of 0 and a NaN percentage instead of raising IndexError.
    """
    total = data.count()
    tt = pd.DataFrame(total)
    tt.columns = ['Total']
    items = []
    vals = []
    for col in data.columns:
        # value_counts() sorts by descending frequency, so the first entry is the mode.
        counts = data[col].value_counts()
        if counts.empty:
            # Nothing to rank: every value in this column is missing.
            items.append(np.nan)
            vals.append(0)
        else:
            items.append(counts.index[0])
            vals.append(counts.iloc[0])
    tt['Most frequent item'] = items
    tt['Frequence'] = vals
    # 0 / 0 for an all-missing column yields NaN rather than an exception.
    tt['Percent from total'] = np.round(tt['Frequence'] / tt['Total'] * 100, 3)
    return tt.T

In [None]:
# Most frequent value per column for the cleaned frame.
most_frequent_values(tweets_df)

## Visualize the data distribution

In [None]:
def plot_count(feature, title, df, size=1, ordered=True):
    """Draw a count plot of one column and label each bar with its share of rows.

    Parameters:
        feature: name of the column of `df` whose values are counted.
        title: readable name inserted into the plot title.
        df: DataFrame containing `feature`.
        size: width multiplier; the figure is 4*size inches wide and 4 inches
            high. Tick labels are rotated when size > 2.
        ordered: if True, show at most the 20 most frequent values, most
            frequent first; otherwise use seaborn's default bar order.
    """
    f, ax = plt.subplots(1, 1, figsize=(4*size, 4))
    total = float(len(df))
    order = df[feature].value_counts().index[:20] if ordered else None
    # Pass the column through `x=`/`data=` keywords: seaborn >= 0.12 treats the
    # first positional argument as `data`, not `x`. Drawing on `ax` explicitly
    # ties the bars to the axes that is annotated below.
    g = sns.countplot(x=feature, data=df, order=order, palette='Set3', ax=ax)
    g.set_title("Number and percentage of {}".format(title))
    if size > 2:
        plt.xticks(rotation=90, size=8)
    for p in ax.patches:
        height = p.get_height()
        # Centre the percentage label horizontally on top of its bar.
        ax.text(p.get_x() + p.get_width() / 2.,
                height,
                '{:1.2f}%'.format(100 * height / total),
                ha="center")
    plt.show()

### Class distribution

In [None]:
# Bar chart of the ham/spam label counts, drawn at double width.
plot_count(feature="class", title="Class", df=tweets_df, size=2)

### Text wordclouds

Before creating the wordclouds, we will remove frequent or parasitic terms. Besides the standard stopwords, we will also exclude internet/tweet-specific tokens, as well as the ubiquitous "nannannan" word in this dataset.

In [None]:
from wordcloud import WordCloud, STOPWORDS
def show_wordcloud(data, title=""):
    """Render a word cloud built from all non-missing entries of `data`.

    `data` is a pandas Series of strings; its entries are joined with spaces
    into one corpus. The default wordcloud stopwords plus a few extra tokens
    are excluded. `title` is shown as the figure title.
    """
    corpus = " ".join(data.dropna())
    ignored_terms = set(STOPWORDS) | {"t", "co", "https", "amp", "U", "nannannan"}
    cloud = WordCloud(
        stopwords=ignored_terms,
        scale=4,
        max_font_size=50,
        max_words=500,
        background_color="black",
    )
    cloud.generate(corpus)

    canvas = plt.figure(1, figsize=(16,16))
    plt.axis('off')
    canvas.suptitle(title, fontsize=20)
    canvas.subplots_adjust(top=2.3)
    plt.imshow(cloud, interpolation='bilinear')
    plt.show()

### Text wordclouds

In [None]:
# Word cloud over the text of every row, regardless of class.
show_wordcloud(tweets_df['text'], title = 'Prevalent words in tweets')

In [None]:
# Restrict to rows labelled "ham" and render their word cloud.
ham_df = tweets_df[tweets_df['class'].eq("ham")]
show_wordcloud(ham_df['text'], title='Prevalent words in tweets in class ham')

In [None]:
# Restrict to rows labelled "spam" and render their word cloud.
spam_df = tweets_df[tweets_df['class'].eq("spam")]
show_wordcloud(spam_df['text'], title='Prevalent words in tweets in class spam')