# importing the libraries

In [None]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import linear_model
from sklearn import metrics

## given datasets

In [None]:
# IPython automagic (not plain Python): list the files in the competition input directory.
ls ../input/nlp-getting-started/

# importing the dataset

In [None]:
# Read the training CSV from the Kaggle input directory into a DataFrame,
# then preview its first five rows.
train_df = pd.read_csv(filepath_or_buffer='/kaggle/input/nlp-getting-started/train.csv')
train_df.head(n=5)

# K-Fold

In [None]:
from sklearn import model_selection
def fold(df, n_splits=5, frac=0.1, random_state=None):
    """Return a shuffled sample of ``df`` with a stratified ``kfold`` column.

    The input frame is left untouched: sampling produces a new DataFrame and
    the ``kfold`` column is added to that copy only.

    Args:
        df: DataFrame with a ``target`` column used for stratification.
        n_splits: Number of stratified folds to assign.
        frac: Fraction of rows to keep when sampling. NOTE: the default of
            0.1 is kept for backward compatibility and means only 10% of
            the rows receive a fold label; pass ``frac=1`` to shuffle and
            keep every row.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            sample (and therefore the folds) can be reproduced.

    Returns:
        A new DataFrame with a fresh 0-based index and an integer ``kfold``
        column holding the validation-fold number of each row.
    """
    # sample() returns a new frame, so the caller's DataFrame is never modified
    sampled = df.sample(frac=frac, random_state=random_state).reset_index(drop=True)

    # -1 marks "not yet assigned"; every row is overwritten in the loop below
    sampled['kfold'] = -1

    labels = sampled.target.values
    splitter = model_selection.StratifiedKFold(n_splits=n_splits)
    for fold_id, (_, valid_idx) in enumerate(splitter.split(X=sampled, y=labels)):
        # valid_idx holds positions, which equal the labels after reset_index
        sampled.loc[valid_idx, 'kfold'] = fold_id
    return sampled

In [None]:
# Build the fold-labelled working frame used by the training cells below,
# then preview its first five rows.
df = fold(df=train_df)
df.head(n=5)

# Training Logistic Regression

In [None]:
def train(fold):
    """Fit a bag-of-words logistic regression on one cross-validation split.

    Rows of the module-level ``df`` whose ``kfold`` value differs from
    ``fold`` are used for fitting; rows whose ``kfold`` equals ``fold`` are
    held out for validation. The validation accuracy is printed and returned.

    Args:
        fold: Value of the ``kfold`` column that selects the validation rows.

    Returns:
        The validation accuracy computed by ``metrics.accuracy_score``.
    """
    # splitting the dataset into training and validation dataset
    is_valid = df.kfold == fold
    train_df = df[~is_valid].reset_index(drop=True)
    valid_df = df[is_valid].reset_index(drop=True)

    # count vectorizer with word_tokenize as the tokenizer
    cv = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)

    # fit_transform learns the vocabulary and encodes the training text in a
    # single pass, so the training text is tokenized once instead of twice
    x_train = cv.fit_transform(train_df.text)
    x_valid = cv.transform(valid_df.text)

    # logistic regression on the count features
    model = linear_model.LogisticRegression()
    model.fit(x_train, train_df.target)

    # prediction on the validation set
    preds = model.predict(x_valid)

    # accuracy_score expects (y_true, y_pred)
    accuracy = metrics.accuracy_score(valid_df.target, preds)

    print(f"Accuracy:{accuracy}")
    print("")
    return accuracy

In [None]:
# Evaluate the model on each of the five folds. The loop variable is not
# named `fold` so that the module-level fold() function is not rebound to an int.
for fold_id in range(5):
    train(fold_id)

# Training Naive Bayes

In [None]:
from sklearn import naive_bayes

def train(fold):
    """Fit a bag-of-words multinomial naive Bayes model on one split.

    Rows of the module-level ``df`` whose ``kfold`` value differs from
    ``fold`` are used for fitting; rows whose ``kfold`` equals ``fold`` are
    held out for validation. The validation accuracy is printed and returned.

    Args:
        fold: Value of the ``kfold`` column that selects the validation rows.

    Returns:
        The validation accuracy computed by ``metrics.accuracy_score``.
    """
    # splitting the dataset into training and validation dataset
    is_valid = df.kfold == fold
    train_df = df[~is_valid].reset_index(drop=True)
    valid_df = df[is_valid].reset_index(drop=True)

    # count vectorizer with word_tokenize as the tokenizer
    cv = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)

    # fit_transform learns the vocabulary and encodes the training text in a
    # single pass, so the training text is tokenized once instead of twice
    x_train = cv.fit_transform(train_df.text)
    x_valid = cv.transform(valid_df.text)

    # multinomial naive Bayes on the count features
    model = naive_bayes.MultinomialNB()
    model.fit(x_train, train_df.target)

    # prediction on the validation set
    preds = model.predict(x_valid)

    # accuracy_score expects (y_true, y_pred)
    accuracy = metrics.accuracy_score(valid_df.target, preds)

    print(f"Accuracy:{accuracy}")
    print("")
    return accuracy

In [None]:
# Evaluate the model on each of the five folds. The loop variable is not
# named `fold` so that the module-level fold() function is not rebound to an int.
for fold_id in range(5):
    train(fold_id)

# Logistic Regression with TF-IDF

In [None]:
def train(fold):
    """Fit a TF-IDF (1- to 4-gram) logistic regression on one split.

    Rows of the module-level ``df`` whose ``kfold`` value differs from
    ``fold`` are used for fitting; rows whose ``kfold`` equals ``fold`` are
    held out for validation. The validation accuracy is printed and returned.

    Args:
        fold: Value of the ``kfold`` column that selects the validation rows.

    Returns:
        The validation accuracy computed by ``metrics.accuracy_score``.
    """
    # splitting the dataset into training and validation dataset
    is_valid = df.kfold == fold
    train_df = df[~is_valid].reset_index(drop=True)
    valid_df = df[is_valid].reset_index(drop=True)

    # TF-IDF vectorizer over word_tokenize tokens, using 1-grams up to 4-grams
    cv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None, ngram_range=(1, 4))

    # fit_transform learns vocabulary and idf weights and encodes the training
    # text in a single pass, so the training text is tokenized once, not twice
    x_train = cv.fit_transform(train_df.text)
    x_valid = cv.transform(valid_df.text)

    # logistic regression on the TF-IDF features
    model = linear_model.LogisticRegression()
    model.fit(x_train, train_df.target)

    # prediction on the validation set
    preds = model.predict(x_valid)

    # accuracy_score expects (y_true, y_pred)
    accuracy = metrics.accuracy_score(valid_df.target, preds)

    print(f"Accuracy:{accuracy}")
    print("")
    return accuracy

In [None]:
# Evaluate the model on each of the five folds. The loop variable is not
# named `fold` so that the module-level fold() function is not rebound to an int.
for fold_id in range(5):
    train(fold_id)