In [1]:
from sklearn.naive_bayes import *       # eg :naive bayes
from sklearn.dummy import *             # Imports dummy estimators, useful for baseline comparisons.
from sklearn.ensemble import *          # eg: random forest, gradient boosting
from sklearn.neighbors import *         # eg: KNN
from sklearn.tree import *              # eg : decision tree

In [4]:
from sklearn.feature_extraction.text import CountVectorizer           #  Converts a collection of text documents to a matrix of token counts.
from sklearn.feature_extraction.text import TfidfVectorizer           #  Converts a collection of raw documents to a matrix of TF-IDF features.
from sklearn.feature_extraction.text import HashingVectorizer         #  Converts a collection of text documents to a matrix of token occurrences using a hashing function.

In [5]:
from sklearn.calibration import *       # Imports methods for calibrating predicted probabilities.
from sklearn.linear_model import *      # eg: linear reg, logistic reg
from sklearn.multiclass import *        # imports strategies for multiclass classification problems
from sklearn.svm import *               # eg: svm
import pandas                           # Imports the pandas library, which is essential for data manipulation and analysis, especially working with DataFrames.

In [6]:
def perform(classifiers, vectorizers, train_data, test_data):   # This function takes lists of classifiers and vectorizers, as well as train_data and test_data as input.
    for classifier in classifiers:
      for vectorizer in vectorizers:    # iterates through each classifier and each vectorizer
        string = ''                     # first we a taking an empty string then later in that string we add the classifiers
        string += classifier.__class__.__name__ + ' with ' + vectorizer.__class__.__name__    # It creates a string to describe the current combination

        # train
        vectorize_text = vectorizer.fit_transform(train_data.v2)        # It trains the vectorizer on the training data (train_data.v2) using fit_transform(). This converts the text data into numerical features.
        classifier.fit(vectorize_text, train_data.v1)                   # It trains the classifier on the vectorized training data and the corresponding labels (train_data.v1) using fit().

        # It transforms the test data (test_data.v2) using the already fitted vectorizer with transform().

        # score
        vectorize_text = vectorizer.transform(test_data.v2)             # It calculates the score of the classifier on the transformed test data and the test labels (test_data.v1) using score(). The score is typically accuracy for classification problems.
        score = classifier.score(vectorize_text, test_data.v1)
        string += '. Has score: ' + str(score)
        print(string)                                                   # It prints the combination of classifier and vectorizer along with its calculated score.


In [7]:
# open data-set and divide it
data = pandas.read_csv('sms_spam.csv', encoding='latin-1')   # reading the file and opening it into a pandas dataframe
# The encoding='latin-1' is specified to handle potential character encoding issues in the file.

learn = data[:4400] # 4400 items  ----> This creates a training dataset by selecting the first 4400 rows of the DataFrame.
test = data[4400:] # 1172 items   ----> This creates a test dataset by selecting the remaining rows from the 4400th row onwards.


In [8]:
learn

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
4395,ham,Dear :-/ why you mood off. I cant drive so i b...,,,
4396,ham,When did dad get back.,,,
4397,ham,Can you tell Shola to please go to college of ...,,,
4398,ham,Yes just finished watching days of our lives. ...,,,


In [9]:
test

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
4400,ham,Many times we lose our best ones bcoz we are,,,
4401,ham,Good FRIENDS CaRE for each Other.. CLoSE Frien...,,,
4402,ham,Just getting back home,,,
4403,ham,"Sorry, I'll call later &lt;#&gt; mins",,,
4404,ham,Dun need to use dial up juz open da browser n ...,,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [10]:
# The code then calls the perform function, passing in lists of various classifier and vectorizer objects that were imported in the previous cells.
# This will execute the training and scoring process for every combination of classifier and vectorizer on the loaded spam data.
perform(
    [
        BernoulliNB(),
        RandomForestClassifier(n_estimators=100, n_jobs=-1),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        DecisionTreeClassifier(),
        CalibratedClassifierCV(),
        DummyClassifier(),
        PassiveAggressiveClassifier(),
        RidgeClassifier(),
        RidgeClassifierCV(),
        SGDClassifier(),
        OneVsRestClassifier(SVC(kernel='linear')),
        OneVsRestClassifier(LogisticRegression()),
        KNeighborsClassifier()
    ],
    [
        CountVectorizer(),
        TfidfVectorizer(),
        HashingVectorizer()
    ],
    learn,
    test
)

BernoulliNB with CountVectorizer. Has score: 0.9778156996587031
BernoulliNB with TfidfVectorizer. Has score: 0.9778156996587031
BernoulliNB with HashingVectorizer. Has score: 0.8728668941979523
RandomForestClassifier with CountVectorizer. Has score: 0.9752559726962458
RandomForestClassifier with TfidfVectorizer. Has score: 0.9778156996587031
RandomForestClassifier with HashingVectorizer. Has score: 0.9692832764505119
AdaBoostClassifier with CountVectorizer. Has score: 0.947098976109215
AdaBoostClassifier with TfidfVectorizer. Has score: 0.9539249146757679
AdaBoostClassifier with HashingVectorizer. Has score: 0.9505119453924915
BaggingClassifier with CountVectorizer. Has score: 0.9650170648464164
BaggingClassifier with TfidfVectorizer. Has score: 0.9650170648464164
BaggingClassifier with HashingVectorizer. Has score: 0.9675767918088737
ExtraTreesClassifier with CountVectorizer. Has score: 0.9778156996587031
ExtraTreesClassifier with TfidfVectorizer. Has score: 0.9803754266211604
ExtraTr

In [11]:
import csv

In [12]:
classifier = OneVsRestClassifier(SVC(kernel='linear'))  # This creates an instance of the OneVsRestClassifier which is a strategy for handling multiclass classification problems.
# It uses a Support Vector Classifier (SVC) with a linear kernel as the base estimator.


vectorizer = TfidfVectorizer()                           # This creates an instance of the TfidfVectorizer, which will be used to convert text data into TF-IDF features.

In [13]:
# train
vectorize_text = vectorizer.fit_transform(learn.v2)    # This line fits the TfidfVectorizer to the text data in the learn DataFrame's 'v2' column and then transforms the text into a matrix of TF-IDF features.
classifier.fit(vectorize_text, learn.v1)               # This line trains the OneVsRestClassifier using the vectorized text data (vectorize_text) and the corresponding labels (learn.v1).

In [14]:
csv_arr = []             # an empty list to store the data

for index, row in learn.iterrows():        # This loop iterates through each row of the learn DataFrame.
    answer = row[0]                        # These lines extract the actual label (answer) and the text content (text) from the current row.
    text = row[1]
    vectorize_text = vectorizer.transform([text])     # This transforms the text of the current row into a TF-IDF feature vector using the already fitted vectorizer. It's important to use transform here, not fit_transform, as the vectorizer has already learned the vocabulary from the training data.
    predict = classifier.predict(vectorize_text)[0]   # This uses the trained classifier to predict the label for the current text.
    if predict == answer:                             # This checks if the predicted label matches the actual label.
        result = 'right'
    else:
        result = 'wrong'
    csv_arr.append([len(csv_arr), text, answer, predict, result])        # Appends a list containing the row number, text, actual answer, predicted answer, and the result ('right' or 'wrong') to the csv_arr.

  answer = row[0]                        # These lines extract the actual label (answer) and the text content (text) from the current row.
  text = row[1]


In [15]:
# write csv
with open('test_score.csv', 'w', newline='') as csvfile:    #  This line opens a file named 'test_score.csv' in write mode ('w'). The newline='' argument is important to prevent extra blank rows from being written in the CSV file. The with statement ensures that the file is automatically closed even if errors occur.
    spamwriter = csv.writer(csvfile, delimiter=';',
            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(['#', 'text', 'answer', 'predict', result])       # This line writes the header row to the CSV file. It includes the column names: '#', 'text', 'answer', 'predict', and 'result'. Note: There is a potential issue here where result is a single value from the last iteration of the previous loop, not the column header 'result'. It should likely be just the string 'result'.

    for row in csv_arr:                                                    # This loop iterates through each row in the csv_arr.
        spamwriter.writerow(row)                                           # This line writes the current row to the CSV file.

In [32]:
import joblib

# Save trained model and vectorizer
joblib.dump(classifier, "spam_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")

print("✅ Model and vectorizer saved successfully!")

✅ Model and vectorizer saved successfully!


In [33]:
import os
print(os.listdir())

['.config', 'vectorizer.pkl', 'sms_spam.csv', 'spam_model.pkl', 'test_score.csv', 'sample_data']


In [37]:
!pip install flask-ngrok
from flask import Flask, render_template_string, request
from flask_ngrok import run_with_ngrok
import joblib

# Load model and vectorizer
model = joblib.load("spam_model.pkl")
vectorizer = joblib.load("vectorizer.pkl")

app = Flask(__name__)
run_with_ngrok(app)  # to create public URL in Colab

# simple inline HTML (no external template folder needed)
html_template = """
<!DOCTYPE html>
<html>
<head>
    <title>SMS Spam Detection</title>
</head>
<body style="text-align:center; margin-top:50px; font-family:Arial;">
    <h2>SMS Spam Detection</h2>
    <form action="/predict" method="post">
        <input type="text" name="message" placeholder="Enter message here" required style="width:300px;padding:10px;">
        <br><br>
        <input type="submit" value="Check" style="padding:8px 20px;">
    </form>
    {% if prediction %}
        <h3>Result: {{ prediction }}</h3>
    {% endif %}
</body>
</html>
"""

@app.route('/')
def home():
    return render_template_string(html_template)

@app.route('/predict', methods=['POST'])
def predict():
    message = request.form['message']
    data = [message]
    vect = vectorizer.transform(data).toarray()
    prediction = model.predict(vect)
    result = 'Spam' if prediction[0] == 1 else 'Not Spam'
    return render_template_string(html_template, prediction=result)

app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
Exception in thread Thread-36:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connection.py", line 198, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
  File "/usr/local/lib/python3.12/dist-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connectionpool.py", line 787, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connectionpool.py", line 493, in _make_reque

In [17]:
import os                                                                       # Imports the os module, which provides a way to interact with the operating system. This is often used for accessing environment variables or file paths.
from flask import Flask, render_template, request, redirect, url_for, jsonify
# Imports specific components from the flask library:
# Flask: The main class for creating a Flask application instance.
# render_template: A function for rendering HTML templates.
# request: An object that holds incoming request data.
# redirect: A function for redirecting the user to a different endpoint.
# url_for: A function for dynamically building URLs.
# jsonify: A function for converting Python dictionaries to JSON responses.

from sklearn.feature_extraction.text import TfidfVectorizer                                     # Imports the TfidfVectorizer again, which will be used within the Flask application to vectorize incoming text messages.
from sklearn.multiclass import *                                                # Imports the OneVsRestClassifier, which is a strategy for handling multiclass classification problems.
from sklearn.svm import *                                                       # Imports all modules from sklearn.svm. This is likely included to have the SVC available for use in the web application.
import pandas

In [18]:
app = Flask(__name__)                # This line creates an instance of the Flask web application. The __name__ argument tells Flask where to look for resources like templates and static files.
global Classifier                    # This line declares that the variable Classifier will be used as a global variable within this scope. This is necessary because the classifier object trained in the next cell will need to be accessible within the Flask route function.
global  Vectorizer                  # This line declares that the variable Vectorizer will be used as a global variable within this scope.

In [19]:
# train model
Classifier = OneVsRestClassifier(SVC(kernel='linear', probability=True))        # This creates a new instance of the OneVsRestClassifier with an SVC base estimator, similar to what was done before, but this time with probability=True which allows the model to return probability estimates.
Vectorizer = TfidfVectorizer()                                                  # This creates a new instance of the TfidfVectorizer.
vectorize_text = Vectorizer.fit_transform(learn.v2)                             # This fits the new Vectorizer to the training data (learn.v2) and transforms it.
Classifier.fit(vectorize_text, learn.v1)

In [20]:
@app.route('/', methods=['GET'])                                      # This is a Flask decorator that defines a route for the application. It means that when a user accesses the root URL (/) of the web application using an HTTP GET request, the index() function below will be executed.

def index():                                                          # This defines the function that will handle requests to the root URL
    message = request.args.get('message', '')                         # This line retrieves the value of the 'message' query parameter from the request URL. If the parameter is not present, it defaults to an empty string.
    error = ''
    predict_proba = ''
    predict = ''                                                      # These lines initialize variables to store the prediction results and any potential errors.

    global Classifier
    global Vectorizer                                                 # These lines declare that the code will be using the global Classifier and Vectorizer variables that were trained in the previous cell.


    try:
        if len(message) > 0:                                          # This checks if the message is not empty.
          vectorize_message = Vectorizer.transform([message])         # If a message exists, it transforms the message into a numerical feature vector using the trained Vectorizer. Note the message is put in a list [message] because the transform method expects an iterable of strings.
          predict = Classifier.predict(vectorize_message)[0]          # This uses the trained Classifier to predict the label for the transformed message.
          predict_proba = Classifier.predict_proba(vectorize_message).tolist()   #  This calculates the probability of the message belonging to each class and converts the result to a list.
    except BaseException as inst:
        error = str(type(inst).__name__) + ' ' + str(inst)
    return jsonify(
              message=message, predict_proba=predict_proba,
              predict=predict, error=error)

if __name__ == '__main__':                                            # This is a standard Python construct that ensures the code inside this block only runs when the script is executed directly (not when it's imported as a module).
    port = int(os.environ.get('PORT', 5000))
    app.run(host='0.0.0.0', port=port, debug=True, use_reloader=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with watchdog (inotify)
ERROR:root:Unexpected exception finding object shape
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/google/colab/_debugpy_repr.py", line 54, in get_shape
    shape = getattr(obj, 'shape', None)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/werkzeug/local.py", line 318, in __get__
    obj = instance._get_current_object()
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/werkzeug/local.py", line 519, in _get_current_object
    raise RuntimeError(unbound_message) from None
RuntimeError: Working outside of request context.

This typically means that you attempted to use functionality that needed
an active HTTP request. Consult the documentation on testing for
information


***port*** = int(os.environ.get('PORT', 5000)):

This line gets the port number from the environment variable 'PORT' if it's set, otherwise it defaults to 5000.

**app.run(host='0.0.0.0', port=port, debug=True, use_reloader=True):** *italicized text*

This line starts the Flask development web server.

***host='0.0.0.0':***

Makes the server accessible from any IP address (useful in environments like Colab).

***port=port:***

Specifies the port to run on.

***debug=True:***

Enables debug mode, which provides more detailed error messages and automatically reloads the server when code changes are detected.

***use_reloader=True:***

Enables the reloader, which automatically restarts the server when code changes.