In [None]:
import re
import string

import numpy as np
import pandas as pd

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from sklearn import metrics

In [None]:
# Read the mail dataset from 'mail.csv' (path is relative to the working directory).
mail = pd.read_csv('mail.csv')

In [None]:
# Column dtypes, non-null counts and memory usage.
mail.info()

In [None]:
# First rows of the frame, as a quick sanity check of the load.
mail.head()

In [None]:
# Summary statistics of the loaded columns.
mail.describe()

### Clean Text with stemming/Lemmatizing

In [None]:
# WordNet lemmatizer instance used by the text-cleaning cells.
# NOTE(review): needs the NLTK 'wordnet' corpus to be available locally — confirm environment.
wn = nltk.WordNetLemmatizer()

In [None]:
# English stop-word list from NLTK; tokens found in it are dropped during cleaning.
# NOTE(review): needs the NLTK 'stopwords' corpus to be available locally — confirm environment.
stopword = stopwords.words('english')

In [None]:
# Build the cleaned text column:
#   1. replace every non-alphanumeric character with a space,
#   2. lower-case BEFORE the stop-word check (the NLTK list is lower-case, so
#      checking the original-case token would let "The", "I", ... through),
#   3. drop stop words, lemmatize what is left and re-join with single spaces.
# A set gives constant-time membership tests instead of scanning the list per token.
stopword_set = set(stopword)
mail['clean'] = mail['Message'].apply(
    lambda x: " ".join(
        wn.lemmatize(token)
        for token in re.sub('[^a-zA-Z0-9]', ' ', x).lower().split()
        if token not in stopword_set
    )
)

In [None]:
mail

In [None]:
X_train, X_test, y_train, y_test = train_test_split(mail['clean'],mail.label,test_size=0.2, random_state=42)

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
# Bag-of-words counts feeding a multinomial Naive Bayes classifier.
# alpha=1.0 applies Laplace smoothing; with alpha=0.0 a word never seen for a
# class gets zero probability, which can cause numerical errors at predict time.
# NOTE(review): class_prior is matched to the classifier's classes in order —
# confirm that 0.4 / 0.6 is the intended prior for this dataset.
nb_pipeline = Pipeline([
    ('countvect', CountVectorizer(lowercase=True)),
    ('mnb', MultinomialNB(alpha=1.0, class_prior=[0.4, 0.6])),
])

In [None]:
# Fit the vectorizer + classifier pipeline on the training messages.
# `model` is whatever `fit` returns (for scikit-learn estimators, the fitted estimator itself).
model = nb_pipeline.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out messages.
y_pred = model.predict(X_test)

In [None]:
# Fraction of held-out messages whose predicted label matches the true one.
metrics.accuracy_score(y_test, y_pred)

In [None]:
# Retrieve the fitted steps by the names they were given when the pipeline was built.
vectorizer = model.named_steps['countvect']
mnb = model.named_steps['mnb']

In [None]:
# Vocabulary learned by the fitted CountVectorizer, as a NumPy array.
# get_feature_names_out() replaces get_feature_names(), which newer
# scikit-learn releases no longer provide. np.asarray is called here (not just
# referenced), so `feature_names` really is an array of terms.
feature_names = np.asarray(vectorizer.get_feature_names_out())

In [None]:
# 5-fold cross-validation of the pipeline on the training split; prints one score per fold.
scores = cross_val_score(nb_pipeline, X_train, y_train, cv=5)
print(scores)

In [None]:
# Predictions for the held-out messages, reused by the metric cells below.
pred = nb_pipeline.predict(X_test)

In [None]:
metrics.accuracy_score(y_test, pred)

In [None]:
# Confusion matrix with rows/columns ordered as labels 0 then 1.
metrics.confusion_matrix(y_test, pred, labels=[0,1])

In [None]:
# Per-class F1 scores (no averaging).
metrics.f1_score(y_test, pred, average=None)

In [None]:
# Unweighted mean of the per-class F1 scores.
metrics.f1_score(y_test, pred, average='macro')

In [None]:
# F1 computed from the pooled counts over all classes.
metrics.f1_score(y_test, pred, average='micro')

In [None]:
# Per-class F1 scores averaged with class-support weights.
metrics.f1_score(y_test, pred, average='weighted')

In [None]:
# Messages whose true label is greater than the predicted one
# (with 0/1 labels as used in the confusion-matrix cell: true 1, predicted 0).
X_test[y_test > pred]

In [None]:
# Messages whose true label is smaller than the predicted one
# (with 0/1 labels: true 0, predicted 1).
X_test[y_test < pred]

In [None]:
# Look at one message by key 43.
# NOTE(review): if X_test keeps the frame's integer index this is a label lookup
# and raises KeyError when row 43 did not land in the test split — confirm.
X_test[43]

In [None]:
# Lemmatizer read by cleanLemm; constructed the same way as the earlier `wn`, so this rebinding is redundant but harmless.
wn = nltk.WordNetLemmatizer()

In [None]:
def cleanLemm(text, lemmatizer=None, stop_words=None):
    """Lower-case, strip punctuation, tokenize, drop stop words and lemmatize.

    Parameters
    ----------
    text : str
        Raw message text.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to the notebook-level ``wn``.
    stop_words : iterable of str, optional
        Tokens to discard. Defaults to the notebook-level ``stopword``.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order; empty for empty or
        punctuation-only input.
    """
    if lemmatizer is None:
        lemmatizer = wn
    if stop_words is None:
        stop_words = stopword
    excluded = set(stop_words)

    # Punctuation characters are deleted (not replaced by a space), so
    # "don't" becomes the single token "dont".
    no_punct = ''.join(ch.lower() for ch in text if ch not in string.punctuation)

    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    # re.split yields '' at the ends when the text starts or ends with a
    # separator, so empty tokens are filtered out below.
    tokens = re.split(r'\W+', no_punct)
    return [lemmatizer.lemmatize(tok) for tok in tokens if tok and tok not in excluded]

In [None]:
mail['clean_lemm'] = mail['Message'].apply(lambda x: cleanLemm(x))

In [None]:
mail.tail()

In [None]:
core = mail['clean_lemm']

In [None]:
core.values

In [None]:
mail.iloc[:, 1]

In [None]:
# Bag-of-words counts over the raw messages, converted to a dense array.
# NOTE(review): .toarray() materializes the full document-term matrix in
# memory — fine for a small corpus, confirm for larger ones.
cv = CountVectorizer()
X = cv.fit_transform(mail['Message']).toarray()
# Select the target by column name (as the train/test split cell does with
# mail.label) instead of by position, which silently depends on column order.
y = mail['label'].values

In [None]:
# Vocabulary learned by the CountVectorizer. get_feature_names_out() replaces
# get_feature_names(), which newer scikit-learn releases no longer provide.
print(cv.get_feature_names_out())

### Apply TfidfVectorizer

In [None]:
# TF-IDF weighted document-term matrix over the raw messages (kept sparse).
tfidf_vect = TfidfVectorizer()
X_tfidf = tfidf_vect.fit_transform(mail['Message'])
print(X_tfidf.shape)
# get_feature_names_out() replaces get_feature_names(), which newer
# scikit-learn releases no longer provide.
print(tfidf_vect.get_feature_names_out())


### Feature Engineering
Transformations: power transformations (square, square root, etc.) and standardizing the data.

In [None]:
from sklearn.naive_bayes import GaussianNB

# Up to this point X_train / X_test hold raw cleaned text, which the estimators
# below cannot consume directly. Re-split on the dense count features X and
# target y so that this and the following model cells get numeric input.
# The seed and test size match the earlier text split.
# NOTE(review): `cv` was fitted on all messages, so the vocabulary also covers
# test-set words — acceptable for exploration, confirm for reported results.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

clf = GaussianNB()
clf.fit(X_train,y_train)


In [None]:

# Mean accuracy on the held-out split (the author noted 0.932 for a previous run).
clf.score(X_test,y_test)

In [None]:
# Mean accuracy on the training split, for comparison with the held-out score.

clf.score(X_train, y_train)

In [None]:
# 5-fold cross-validated scores for a default MultinomialNB on the training split.
nb = MultinomialNB()
scores = cross_val_score(nb, X_train, y_train, cv=5)
print(scores)

In [None]:
# Pipeline needs a list of (name, estimator) steps; calling it with no
# arguments raises TypeError. The surrounding cells work on already-vectorized
# features, so the pipeline only wraps the classifier.
# NOTE(review): this rebinds `nb_pipeline`, replacing the earlier text pipeline — confirm intended.
nb_pipeline = Pipeline([('mnb', MultinomialNB())])

In [None]:

# Default multinomial Naive Bayes fitted on the training split.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

In [None]:
# Predictions for the held-out split; rebinds `pred` from the earlier pipeline evaluation.
pred = nb_classifier.predict(X_test)

In [None]:
metrics.accuracy_score(y_test, pred)

In [None]:
# Confusion matrix with rows/columns ordered as labels 0 then 1.
metrics.confusion_matrix(y_test, pred, labels=[0,1])

In [None]:
# Unweighted mean of the per-class F1 scores.
metrics.f1_score(y_test, pred, average='macro')

In [None]:
# F1 computed from the pooled counts over all classes.
metrics.f1_score(y_test, pred, average='micro')

In [None]:
# Per-class F1 scores averaged with class-support weights.
metrics.f1_score(y_test, pred, average='weighted')

In [None]:
# Per-class F1 scores (no averaging).
metrics.f1_score(y_test, pred, average=None)

In [None]:
rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5, 50, 250],
    'max_depth': [2, 4, 8, 16, 32, None],
    'criterion':['gini', 'entropy']
}

cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(X_train, y_train)

print_results(cv)

In [None]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [None]:
# Shapes of the current train and test feature sets.
X_train.shape, X_test.shape