# Machine Learning Project

## Spam Email Filtering using Scikit-Learn and RandomForestClassifier

### Step 1 - Import Required Modules

In [None]:
# String Module
import string

# Numpy and Pandas Modules
import numpy as np
import pandas as pd

# Natural Language Toolkit Library
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Scikit-Learn Library
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

### Step 2 - Download Stopwords from NLTK

In [None]:
# Download the NLTK stopword corpus only when it is not already installed
# locally, so re-running this cell skips the network round-trip and still
# works offline.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

### Step 3 - Load and Clean Data for Usage

In [None]:
# Load the dataset from the current working directory.
df = pd.read_csv('spam_ham_dataset.csv')

# Replace Windows line breaks with a single space. One pass is enough:
# after it no '\r\n' sequence remains. regex=False treats the pattern as a
# literal string rather than a regular expression.
df['text'] = df['text'].str.replace('\r\n', ' ', regex=False)

### Step 4 - Preprocess the Data to Make it Suitable for Training

In [None]:
stemmer = PorterStemmer()
stopwords_set = set(stopwords.words('english'))

# Build the punctuation-stripping table once; it is the same for every row.
punctuation_table = str.maketrans('', '', string.punctuation)


def _normalize(message):
    """Return *message* lower-cased, without punctuation or stopwords, stemmed.

    The message is split on whitespace; surviving tokens are stemmed and
    re-joined with single spaces.
    """
    tokens = message.lower().translate(punctuation_table).split()
    # NOTE(review): punctuation is removed before the stopword check, so any
    # stopword that contains punctuation (presumably contractions such as
    # "don't") can no longer match the set - confirm this is intended.
    return ' '.join(
        stemmer.stem(token) for token in tokens if token not in stopwords_set
    )


# One cleaned string per row of df['text'], in row order.
corpus = [_normalize(message) for message in df['text']]

### Step 5 - Convert Text into Vectors and Split Data into Training and Testing Sets

In [None]:
# Turn each cleaned message into a vector of token counts.
vectorizer = CountVectorizer()

# fit_transform returns a sparse matrix; .toarray() converts it to a dense
# NumPy array (kept so X stays an ndarray for any later cells).
X = vectorizer.fit_transform(corpus).toarray()
# presumably the numeric class label for each message - verify against the CSV
y = df['label_num']

# Hold out 20% of the rows for testing. The fixed random_state makes the
# split (and therefore the reported score) reproducible between runs, and
# stratify=y keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### Step 6 - Train the Machine Learning Model (RandomForestClassifier)

In [None]:
# n_jobs=-1 lets scikit-learn use all available processors; the fixed
# random_state makes the fitted forest repeatable between runs.
classifier = RandomForestClassifier(n_jobs=-1, random_state=42)

classifier.fit(X_train, y_train)

### Step 7 - Check the Performance of the Model

In [None]:
# Mean accuracy of the fitted classifier on the held-out test split.
classifier.score(X=X_test, y=y_test)