In [15]:
import numpy as np
import pandas as pd
import re
import emoji
import autocorrect 
from textblob import TextBlob 
from warnings import filterwarnings
filterwarnings("ignore")
import nltk
import joblib
from joblib import Memory
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer,LancasterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score,ConfusionMatrixDisplay,recall_score,precision_score

In [83]:
#using clean data 

data = pd.read_csv("clean_data.csv")

In [85]:
data.head(3)

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings,Emotion
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,nice product good qualiti price rise bad sign ...,4,Positive
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,suppli yonex mavi outsid cover yonex ad insid ...,1,Negative
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,worst product damag shuttlecock pack new box o...,1,Negative


In [None]:
`

### Identify Input and Output

In [86]:
X=data['Review text']
y=data['Emotion']

In [87]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=42)

### Mlflow task

In [108]:
import mlflow
import mlflow.sklearn
mlflow.set_experiment('ML_project')

<Experiment: artifact_location='file:///C:/Users/RATNADEEP/Desktop/backend/yonex/mlruns/896545237442633190', creation_time=1711507013306, experiment_id='896545237442633190', last_update_time=1711507013306, lifecycle_stage='active', name='ML_project', tags={}>

In [109]:
import joblib
from joblib import Memory

import os

In [113]:
cachedir = '.cache'
memory = Memory(location=cachedir, verbose=0)

pipelines = {
    'logistic_regression': Pipeline([
        ('vectorization',TfidfVectorizer()),
        ('classifier', LogisticRegression())
    ], memory=memory),
    'naive_bayes': Pipeline([
        ('vectorization',TfidfVectorizer()),
        ('classifier', MultinomialNB())
    ], memory=memory),
    'knn': Pipeline([
        ('vectorization',TfidfVectorizer()),
        ('classifier',  KNeighborsClassifier())
    ], memory=memory)
    
}


from sklearn.feature_extraction.text import CountVectorizer

# Defining parameter grids
param_grids = {
    'logistic_regression': {
        'vectorization': [CountVectorizer(), TfidfVectorizer()],
        'vectorization__max_features': [1000, 1500, 2000, 5000],
        'classifier__C': [0.1, 1.0, 10.0]
    },
    'naive_bayes': {
        'vectorization': [CountVectorizer()],
        'vectorization__max_features': [1000, 1500, 2000, 5000],
        'classifier__alpha': [1, 10]
    },
    'knn': {
        'vectorization': [CountVectorizer(), TfidfVectorizer()],
        'vectorization__max_features': [1000, 1500, 2000, 5000],
        'classifier__n_neighbors': [3, 5, 7, 9]
    }
}


from sklearn.model_selection import GridSearchCV

# Perform GridSearchCV for each algorithm
best_models = {}

for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='accuracy', 
                               return_train_score=True,
                               verbose=1
                              )
    
    # Fit grid search
    #%time grid_search.fit(X_train, y_train)
    mlflow.sklearn.autolog(max_tuning_runs=None)
    
    with mlflow.start_run() as run:
        %time grid_search.fit(X_train, y_train)
    
    
 
    print('Score on Test Data: ', grid_search.score(X_test, y_test))
    
    print('Train Score: ', grid_search.best_score_)
    
    # Store best model
    best_models[algo] = grid_search.best_estimator_
    
    print()


********** logistic_regression **********




Fitting 5 folds for each of 24 candidates, totalling 120 fits
CPU times: total: 2min 13s
Wall time: 3min 12s
Score on Test Data:  0.9882491186839013
Train Score:  0.9844268865965144

********** naive_bayes **********




Fitting 5 folds for each of 8 candidates, totalling 40 fits
CPU times: total: 28.9 s
Wall time: 51.7 s
Score on Test Data:  0.918918918918919
Train Score:  0.9241828965270203

********** knn **********




Fitting 5 folds for each of 32 candidates, totalling 160 fits
CPU times: total: 8min 38s
Wall time: 9min 18s
Score on Test Data:  0.9435957696827262
Train Score:  0.9334413345978436



In [1]:
from prefect import task, flow

In [25]:
@task
def load_data(file_path):
    """
    Load data from a CSV file.
    
    Args:
    - file_path (str): Path to the CSV file.
    
    Returns:
    - DataFrame: Loaded DataFrame.
    """
    return pd.read_csv(file_path)



@task
def split_inputs_output(data, inputs, output):
    """
    Split features and target variables.
    """
    X = data[inputs]
    y = data[output]
    return X, y

@task
def split_train_test(X, y, test_size=0.20, random_state=42):
    """
    Split data into train and test sets.
    """
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

@task
def preprocess_data(X_train, X_test, y_train, y_test):
    """
    Preprocess the data.
    """
    count = CountVectorizer()
    X_train_vec = count.fit_transform(X_train)
    X_test_vec = count.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test

@task
def train_model(X_train_vec, y_train, hyperparameters):
    """
    Training the KNN model.
    """

    clf = KNeighborsClassifier(**hyperparameters)
    
    clf.fit(X_train_vec, y_train)
    return clf

@task
def model_evaluation(model, X_train_vec,y_train, X_test_vec, y_test):
    """
    Evaluating the model.
    """
    y_train_pred = model.predict(X_train_vec)
    y_test_pred = model.predict(X_test_vec)
    
    train_score = metrics.accuracy_score(y_train, y_train_pred)
    test_score = metrics.accuracy_score(y_test, y_test_pred)
    
    return train_score, test_score


In [26]:
#workflow
@flow(name="KNN Training Flow")
def workflow():
    DATA_PATH = "Clean_data.csv" 
    INPUTS = 'Review text' 
    OUTPUT = 'Emotion'
    HYPERPARAMETERS = {'n_neighbors': 3, 'p': 2} 
    
    data = load_data(DATA_PATH)

    # Identify Inputs and Output
    X, y = split_inputs_output(data, INPUTS, OUTPUT)

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = split_train_test(X, y)

    # Preprocess the data
    X_train_vec, X_test_vec, y_train, y_test = preprocess_data(X_train, X_test, y_train, y_test)

    # Build a model
    model = train_model(X_train_vec, y_train, HYPERPARAMETERS)
    
    # Evaluation
    train_score, test_score = model_evaluation(model, X_train_vec, y_train, X_test_vec, y_test)
    
    print("Train Score:", train_score)
    print("Test Score:", test_score)




In [27]:
if __name__ == "__main__":
    workflow()


Train Score: 0.9490155744930944
Test Score: 0.927144535840188


In [30]:
if __name__ == "__main__":
    workflow.serve(
        name="my-first-deployment",
        cron="0 * * * *"
    )

In [None]:
#mlflow.set_experiment("Ml_project")

In [None]:
## Creating pickel file

In [None]:
#import pickle

In [None]:
#pickle.dump(lr_pipe,open("senti.pkl",'wb'))

In [None]:
#predict = pickle.load(open("senti.pkl",'rb'))