# Creating the Machine Learning models

## Importing the data from SQL Database 

In [2]:
from mysql.connector import connect, Error

import pandas as pd

# importing OS because of the enviourment variable

import os

USER = os.environ['USER']

PASSWD = os.environ['PASSWD']

In [3]:
def db_connection(host_name, user_name, user_password, db_name):
    try:
        
        conn = connect(host=host_name, user=user_name, password=user_password, database=db_name)
        
    except Error as e:
        
        print(e)
    
    else:
        
        print(f'Connection to {db_name} is successfully!')
    
    return conn

In [4]:
def retrive_data():
    
    try:
        
        df = pd.read_sql_query('SELECT * FROM news_data_clean;', conn)
        
    except Error as e:
        
        print(e)
    
    return df

In [5]:
conn = db_connection(host_name = 'localhost', user_name = USER, user_password = PASSWD, db_name = 'news_data')

Connection to news_data is successfully!


In [6]:
df = pd.DataFrame(columns=['category', 'description'])

In [7]:
df = retrive_data()

In [8]:
df.head()

Unnamed: 0,category,description
0,general,"trusted,source,breaking,news,analysis,exclusiv..."
1,general,"australias,trusted,source,local,national,world..."
2,general,"news,analysis,middle,east,worldwide,multimedia..."
3,technology,"pc,enthusiasts,resource,power,users,tools,love..."
4,general,"ap,delivers,indepth,coverage,international,pol..."


In [9]:
df.count()

category       87
description    87
dtype: int64

# Building the Machine Learning models

## Importing the maachine learning libraries 

In [10]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.naive_bayes import MultinomialNB

# to convert the text into vectors

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix

## Converting the text to into form of a Vector

In [11]:
tfidf_vect = TfidfVectorizer(analyzer='word')

x_tfidf = tfidf_vect.fit_transform(df['description'])

x_features = pd.DataFrame(x_tfidf.toarray())

In [12]:
# splitting the data set to training and testing data set

x_train, x_test, y_train, y_test = train_test_split(x_features, df['category'], test_size=30)

## Decision Tree Classification model

In [13]:
dtc = DecisionTreeClassifier(max_depth=500)

dt_model = dtc.fit(x_train, y_train)

pred = dt_model.predict(x_test)

print('Accuracy score :- {}'.format(round(accuracy_score(y_test, pred), 2)))

print(format(classification_report(y_test, pred,zero_division=1)))

Accuracy score :- 0.7
               precision    recall  f1-score   support

     business       0.00      0.00      0.00         1
entertainment       0.80      0.80      0.80         5
      general       0.70      0.94      0.80        17
      science       1.00      0.00      0.00         1
       sports       1.00      0.25      0.40         4
   technology       1.00      0.00      0.00         2

     accuracy                           0.70        30
    macro avg       0.75      0.33      0.33        30
 weighted avg       0.76      0.70      0.64        30



## K Neighbors Classification model

In [14]:
dtc = KNeighborsClassifier()

dt_model = dtc.fit(x_train, y_train)

pred = dt_model.predict(x_test)

print('Accuracy score :- {}'.format(round(accuracy_score(y_test, pred), 2)))

print(format(classification_report(y_test, pred,zero_division=1)))

Accuracy score :- 0.67
               precision    recall  f1-score   support

     business       0.33      1.00      0.50         1
entertainment       0.67      0.40      0.50         5
      general       0.70      0.94      0.80        17
      science       1.00      0.00      0.00         1
       sports       1.00      0.25      0.40         4
   technology       1.00      0.00      0.00         2

     accuracy                           0.67        30
    macro avg       0.78      0.43      0.37        30
 weighted avg       0.75      0.67      0.61        30



## Random Forest Classification model

In [15]:
dtc = RandomForestClassifier()

dt_model = dtc.fit(x_train, y_train)

pred = dt_model.predict(x_test)

print('Accuracy score :- {}'.format(round(accuracy_score(y_test, pred), 2)))

print(format(classification_report(y_test, pred,zero_division=1)))

Accuracy score :- 0.7
               precision    recall  f1-score   support

     business       1.00      1.00      1.00         1
entertainment       1.00      0.40      0.57         5
      general       0.65      1.00      0.79        17
      science       1.00      0.00      0.00         1
       sports       1.00      0.25      0.40         4
   technology       1.00      0.00      0.00         2

     accuracy                           0.70        30
    macro avg       0.94      0.44      0.46        30
 weighted avg       0.80      0.70      0.63        30



## Multinomial NB Classification

In [16]:
dtc = MultinomialNB()

dt_model = dtc.fit(x_train, y_train)

pred = dt_model.predict(x_test)

print('Accuracy score :- {}'.format(round(accuracy_score(y_test, pred), 2)))

print(format(classification_report(y_test, pred,zero_division=1)))

Accuracy score :- 0.57
               precision    recall  f1-score   support

     business       1.00      0.00      0.00         1
entertainment       1.00      0.00      0.00         5
      general       0.57      1.00      0.72        17
      science       1.00      0.00      0.00         1
       sports       1.00      0.00      0.00         4
   technology       1.00      0.00      0.00         2

     accuracy                           0.57        30
    macro avg       0.93      0.17      0.12        30
 weighted avg       0.75      0.57      0.41        30

