# James's EDA and Data Exploration

### Import Libraries and Facebook Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the Facebook data set; the CSV's first column is used as the index.
facebook = pd.read_csv(
    'assets/facebook.csv',
    index_col=0,
)

In [None]:
# Preview the first five rows as a quick sanity check of the load.
facebook.head(n=5)

In [None]:
# Display the entirety of each cell instead of truncating long strings with
# an ellipsis. `None` means "no limit"; the old `-1` sentinel was deprecated
# in pandas 1.0 and is rejected with a ValueError by pandas 2.x.
pd.set_option('display.max_colwidth', None)

### Basic analysis and exploration

In [None]:
# Summary statistics for the numeric columns of the data set.
numeric_summary = facebook.describe()
numeric_summary

In [None]:
# Median reaction count per brand.
facebook.groupby('brand')['reaction_count'].median()

In [None]:
# Per-brand summary statistics of the angry-reaction counts.
facebook.groupby('brand')['angry_count'].describe()

In [None]:
# Per-brand summary statistics of the like counts.
facebook.groupby('brand')['like_count'].describe()

In [None]:
# Per-brand summary statistics of the share counts.
facebook.groupby('brand')['shares'].describe()

In [None]:
# Original observation: the top-shared post overall is from Glamour Magazine,
# with a video depicting the menstrual cycle.
#
# Look up the permalink of brand 137316's post whose share count equals the
# maximum over the whole data set. `Series.max()` skips missing values,
# whereas the builtin `max()` compares element by element and can return NaN
# (which then matches no row) depending on where a NaN sits in the column.
is_brand = facebook['brand'] == 137316
is_top_shared = facebook['shares'] == facebook['shares'].max()
facebook.loc[is_brand & is_top_shared, 'permalink']

In [None]:
# Five posts with the most shares overall, highest first.
facebook.sort_values('shares', ascending=False).head(5)

In [None]:
# Ten posts with the most reactions, highest first, limited to the columns
# needed to identify each post.
reaction_cols = ['brand', 'impact', 'message', 'permalink', 'reaction_count']
facebook[reaction_cols].sort_values('reaction_count', ascending=False).head(10)

In [None]:
# Ten posts with the most shares, highest first, shown alongside their
# reaction counts.
share_cols = ['brand', 'impact', 'message', 'permalink', 'reaction_count', 'shares']
facebook[share_cols].sort_values('shares', ascending=False).head(10)

### Use CountVectorizer to build vocabulary

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, learning_curve, train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.pipeline import Pipeline

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Use seaborn's "darkgrid" style for plots drawn after this point.
sns.set_style("darkgrid")

In [None]:
# Initialize two count vectorizers: one that drops scikit-learn's built-in
# English stop words, and one that keeps every token.
cvt_stop = CountVectorizer(stop_words='english')
cvt = CountVectorizer()

In [None]:
# Hold out 30% of the posts for testing: the message text is the feature and
# like_count the target. The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    facebook['message'],
    facebook['like_count'],
    test_size=0.3,
    random_state=6,
)

In [None]:
# Fit the stop-word-filtered vectorizer on the training messages; the result
# is a sparse document-term count matrix.
message_trn = cvt_stop.fit_transform(X_train)

# Vocabulary terms in column order. `get_feature_names()` was removed in
# scikit-learn 1.2, so prefer `get_feature_names_out()` and only fall back to
# the old name on releases that predate it. `list(...)` keeps `m_words` a
# plain list, as it was before.
feature_names_fn = (getattr(cvt_stop, 'get_feature_names_out', None)
                    or cvt_stop.get_feature_names)
m_words = list(feature_names_fn())

# Total occurrences of each term. Summing the sparse matrix directly avoids
# building a dense (documents x vocabulary) array with `.toarray()`.
m_freqs = np.asarray(message_trn.sum(axis=0)).ravel()

# Pair each word with its count, and make those pairs a DataFrame.
mess_freq = [[word, freq] for word, freq in zip(m_words, m_freqs)]
messageDF = pd.DataFrame(mess_freq, columns=['word', 'frequency'])

# show the 20 most commonly used words
messageDF.sort_values('frequency', ascending=False).head(20)

In [None]:
# Count vectorizer restricted to two-word phrases (bigrams), with English
# stop words removed.
cvt_stop_2wp = CountVectorizer(stop_words='english', ngram_range=(2,2))

# Sparse document-term count matrix for the training messages.
msg_train = cvt_stop_2wp.fit_transform(X_train)

# Bigram vocabulary in column order. `get_feature_names()` was removed in
# scikit-learn 1.2, so prefer `get_feature_names_out()` and only fall back to
# the old name on releases that predate it. `list(...)` keeps `msg_wds` a
# plain list, as it was before.
feature_names_fn = (getattr(cvt_stop_2wp, 'get_feature_names_out', None)
                    or cvt_stop_2wp.get_feature_names)
msg_wds = list(feature_names_fn())

# Total occurrences of each bigram. Summing the sparse matrix directly avoids
# building a dense (documents x vocabulary) array with `.toarray()`, which is
# especially large for an n-gram vocabulary.
msg_frq = np.asarray(msg_train.sum(axis=0)).ravel()

# Pair each phrase with its count, and make those pairs a DataFrame.
msg_freq = [[phrase, freq] for phrase, freq in zip(msg_wds, msg_frq)]
phrase2DF = pd.DataFrame(msg_freq, columns=['word', 'frequency'])

# show the 20 most commonly used two-word phrases
phrase2DF.sort_values('frequency', ascending=False).head(20)