# Creating the Machine Learning models

## Importing the data from SQL Database 

In [13]:
%env USER = user1
%env PASSWD = #password99

env: USER=user1
env: PASSWD=#password99


In [14]:
from mysql.connector import connect, Error

import pandas as pd

# importing OS because of the enviourment variable

import os

USER = os.environ['USER']

PASSWD = os.environ['PASSWD']

In [15]:
def db_connection(host_name, user_name, user_password, db_name):
    try:
        
        conn = connect(host=host_name, user=user_name, password=user_password, database=db_name)
        
    except Error as e:
        
        print(e)
    
    else:
        
        print(f'Connection to {db_name} is successfully!')
    
    return conn

In [28]:
def retrive_data():
    
    try:
        
        df = pd.read_sql_query('SELECT * FROM news_data_clean;', conn)
        
    except Error as e:
        
        print(e)
    
    return df

In [17]:
conn = db_connection(host_name = 'localhost', user_name = USER, user_password = PASSWD, db_name = 'news_data')

Connection to news_data is successfully!


In [18]:
df = pd.DataFrame(columns=['category', 'description'])

In [19]:
df = retrive_data()

      category                                        description
0      general  trusted,source,breaking,news,analysis,exclusiv...
1      general  australias,trusted,source,local,national,world...
2      general  news,analysis,middle,east,worldwide,multimedia...
3   technology  pc,enthusiasts,resource,power,users,tools,love...
4      general  ap,delivers,indepth,coverage,international,pol...
..         ...                                                ...
75     general  washington,times,delivers,breaking,news,commen...
76     general  breaking,news,analysis,timecom,politics,world,...
77     general  get,latest,national,international,political,ne...
78     general  vice,news,vice,media,incs,current,affairs,chan...
79  technology  wired,monthly,american,magazine,published,prin...

[80 rows x 2 columns]


In [20]:
df.head()

Unnamed: 0,category,description
0,general,"trusted,source,breaking,news,analysis,exclusiv..."
1,general,"australias,trusted,source,local,national,world..."
2,general,"news,analysis,middle,east,worldwide,multimedia..."
3,technology,"pc,enthusiasts,resource,power,users,tools,love..."
4,general,"ap,delivers,indepth,coverage,international,pol..."


In [29]:
df.count()

category       80
description    80
dtype: int64

# Building the Machine Learning models

## Importing the maachine learning libraries 

In [21]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.naive_bayes import MultinomialNB

# to convert the text into vectors

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix

## Converting the text to into form of a Vector

In [22]:
tfidf_vect = TfidfVectorizer(analyzer='word')

x_tfidf = tfidf_vect.fit_transform(df['description'])

x_features = pd.DataFrame(x_tfidf.toarray())

In [23]:
# splitting the data set to training and testing data set

x_train, x_test, y_train, y_test = train_test_split(x_features, df['category'], test_size=30)

## Decision Tree Classification model

In [24]:
dtc = DecisionTreeClassifier(max_depth=500)

dt_model = dtc.fit(x_train, y_train)

pred = dt_model.predict(x_test)

print('Accuracy score :- {}'.format(round(accuracy_score(y_test, pred), 2)))

print(format(classification_report(y_test, pred,zero_division=1)))

Accuracy score :- 0.5
               precision    recall  f1-score   support

     business       0.00      0.00      0.00         2
entertainment       1.00      0.00      0.00         5
      general       0.57      0.81      0.67        16
       health       1.00      0.00      0.00         1
      science       1.00      0.00      0.00         1
       sports       1.00      0.50      0.67         2
   technology       0.25      0.33      0.29         3

     accuracy                           0.50        30
    macro avg       0.69      0.24      0.23        30
 weighted avg       0.63      0.50      0.43        30



## K Neighbors Classification model

In [25]:
dtc = KNeighborsClassifier()

dt_model = dtc.fit(x_train, y_train)

pred = dt_model.predict(x_test)

print('Accuracy score :- {}'.format(round(accuracy_score(y_test, pred), 2)))

print(format(classification_report(y_test, pred,zero_division=1)))

Accuracy score :- 0.6
               precision    recall  f1-score   support

     business       0.00      0.00      0.00         2
entertainment       0.50      0.20      0.29         5
      general       0.60      0.94      0.73        16
       health       1.00      0.00      0.00         1
      science       1.00      0.00      0.00         1
       sports       1.00      1.00      1.00         2
   technology       1.00      0.00      0.00         3

     accuracy                           0.60        30
    macro avg       0.73      0.31      0.29        30
 weighted avg       0.64      0.60      0.50        30



## Random Forest Classification model

In [26]:
dtc = RandomForestClassifier()

dt_model = dtc.fit(x_train, y_train)

pred = dt_model.predict(x_test)

print('Accuracy score :- {}'.format(round(accuracy_score(y_test, pred), 2)))

print(format(classification_report(y_test, pred,zero_division=1)))

Accuracy score :- 0.57
               precision    recall  f1-score   support

     business       1.00      0.00      0.00         2
entertainment       1.00      0.00      0.00         5
      general       0.55      1.00      0.71        16
       health       1.00      0.00      0.00         1
      science       1.00      0.00      0.00         1
       sports       1.00      0.50      0.67         2
   technology       1.00      0.00      0.00         3

     accuracy                           0.57        30
    macro avg       0.94      0.21      0.20        30
 weighted avg       0.76      0.57      0.42        30



## Multinomial NB Classification

In [27]:
dtc = MultinomialNB()

dt_model = dtc.fit(x_train, y_train)

pred = dt_model.predict(x_test)

print('Accuracy score :- {}'.format(round(accuracy_score(y_test, pred), 2)))

print(format(classification_report(y_test, pred,zero_division=1)))

Accuracy score :- 0.53
               precision    recall  f1-score   support

     business       1.00      0.00      0.00         2
entertainment       1.00      0.00      0.00         5
      general       0.53      1.00      0.70        16
       health       1.00      0.00      0.00         1
      science       1.00      0.00      0.00         1
       sports       1.00      0.00      0.00         2
   technology       1.00      0.00      0.00         3

     accuracy                           0.53        30
    macro avg       0.93      0.14      0.10        30
 weighted avg       0.75      0.53      0.37        30

