<a href="https://colab.research.google.com/github/rabinam24/Hands-on-Python-NLP/blob/main/Practical_assignment_vii.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.linear_model import Lasso, Ridge
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [2]:
# Load the California housing dataset
housing = fetch_california_housing()
X, y = housing.data, housing.target

In [3]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Instantiate and train Lasso (L1) regression model
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
lasso_train_pred = lasso.predict(X_train)
lasso_test_pred = lasso.predict(X_test)

In [5]:
# Evaluate Lasso model
lasso_train_mse = mean_squared_error(y_train, lasso_train_pred)
lasso_test_mse = mean_squared_error(y_test, lasso_test_pred)
print("Lasso Train MSE:", lasso_train_mse)
print("Lasso Test MSE:", lasso_test_mse)

Lasso Train MSE: 0.60300014172392
Lasso Test MSE: 0.6135115198058131


In [6]:
# Instantiate and train Ridge (L2) regression model
ridge = Ridge(alpha=0.1)
ridge.fit(X_train, y_train)
ridge_train_pred = ridge.predict(X_train)
ridge_test_pred = ridge.predict(X_test)

In [7]:
# Evaluate Ridge model
ridge_train_mse = mean_squared_error(y_train, ridge_train_pred)
ridge_test_mse = mean_squared_error(y_test, ridge_test_pred)
print("Ridge Train MSE:", ridge_train_mse)
print("Ridge Test MSE:", ridge_test_mse)

Ridge Train MSE: 0.5179331264220425
Ridge Test MSE: 0.5558827543113783


In [8]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [11]:
train_data = open('/content/drive/MyDrive/training_data.txt', 'r+')
test_data = open('/content/drive/MyDrive/test_dataset.txt', 'r+')

train = pd.DataFrame(train_data.readlines(), columns = ['Question'])
test = pd.DataFrame(test_data.readlines(), columns = ['Question'])

In [12]:
train.head()

Unnamed: 0,Question
0,DESC:manner How did serfdom develop in and the...
1,ENTY:cremat What films featured the character ...
2,DESC:manner How can I find a list of celebriti...
3,ENTY:animal What fowl grabs the spotlight afte...
4,ABBR:exp What is the full form of .com ?\n


In [13]:
train['QType'] = train.Question.apply(lambda x: x.split(' ', 1)[0])
train['Question'] = train.Question.apply(lambda x: x.split(' ', 1)[1])
train['QType-Coarse'] = train.QType.apply(lambda x: x.split(':')[0])
train['QType-Fine'] = train.QType.apply(lambda x: x.split(':')[1])
test['QType'] = test.Question.apply(lambda x: x.split(' ', 1)[0])
test['Question'] = test.Question.apply(lambda x: x.split(' ', 1)[1])
test['QType-Coarse'] = test.QType.apply(lambda x: x.split(':')[0])
test['QType-Fine'] = test.QType.apply(lambda x: x.split(':')[1])

In [14]:
train.head()

Unnamed: 0,Question,QType,QType-Coarse,QType-Fine
0,How did serfdom develop in and then leave Russ...,DESC:manner,DESC,manner
1,What films featured the character Popeye Doyle...,ENTY:cremat,ENTY,cremat
2,How can I find a list of celebrities ' real na...,DESC:manner,DESC,manner
3,What fowl grabs the spotlight after the Chines...,ENTY:animal,ENTY,animal
4,What is the full form of .com ?\n,ABBR:exp,ABBR,exp


In [15]:
train.pop('QType')
train.pop('QType-Fine')
test.pop('QType')
test.pop('QType-Fine')

0           dist
1           city
2           desc
3            def
4           date
         ...    
495          ind
496     currency
497        count
498    substance
499          def
Name: QType-Fine, Length: 500, dtype: object

In [16]:
classes = np.unique(np.array(train['QType-Coarse']))
classes

array(['ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM'], dtype=object)

In [17]:
label = LabelEncoder()
label.fit(pd.Series(train['QType-Coarse'].tolist() + test['QType-Coarse'].tolist()).values)
train['QType-Coarse'] = label.transform(train['QType-Coarse'].values)
test['QType-Coarse'] = label.transform(test['QType-Coarse'].values)

In [18]:
all_corpus = pd.Series(train.Question.tolist() + test.Question.tolist()).astype(str)

In [19]:
def preprocess(corpus, remove_stopwords=True):
    '''
    Function to preprocess the text corpus

    Input :
        corpus : Text data corpus
        remove_stopwords : Boolean, True if stopwords need to be removed

    Output : Returns the processed text corpus
    '''
    cleaned_corpus = []  # Create an empty list to store processed text

    for row in corpus:
        qs = []
        for word in row.split():
            p1 = re.sub(pattern='[^a-zA-Z]', repl=' ', string=word)
            p1 = p1.lower()
            qs.append(p1)
        cleaned_corpus.append(' '.join(qs))  # Append the processed text as a single string to the list

    cleaned_corpus = pd.Series(cleaned_corpus)  # Convert the list to a Pandas Series
    return cleaned_corpus

In [20]:
def stopwords_removal(corpus):
    wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
    stop = set(stopwords.words('english'))
    for word in wh_words:
        stop.remove(word)
    corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    return corpus

In [21]:
def lemmatize(corpus):
    lem = WordNetLemmatizer()
    corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    return corpus

In [22]:
def stem(corpus, stem_type = None):
    if stem_type == 'snowball':
        stemmer = SnowballStemmer(language = 'english')
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    else :
        stemmer = PorterStemmer()
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    return corpus

In [28]:
def preprocess(corpus, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):

    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)

    Input :
    'corpus' - Text corpus on which pre-processing tasks will be performed

    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer

    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together

    Output : Returns the processed text corpus

    '''
    # if cleaning == True:
        # corpus = text_clean(corpus)

    if remove_stopwords == True:
        corpus = stopwords_removal(corpus)
    else :
        corpus = [[x for x in x.split()] for x in corpus]

    if lemmatization == True:
        corpus = lemmatize(corpus)


    if stemming == True:
        corpus = stem(corpus, stem_type)

    corpus = [' '.join(x) for x in corpus]


    return corpus

In [29]:
all_corpus = preprocess(all_corpus, remove_stopwords = True)

In [30]:
train_corpus = all_corpus[0:train.shape[0]]
test_corpus = all_corpus[train.shape[0]:]

In [31]:
vectorizer = TfidfVectorizer()
tf_idf_matrix_train = vectorizer.fit_transform(train_corpus)

In [32]:
tf_idf_matrix_test = vectorizer.transform(test_corpus)

In [33]:
import keras
from keras.models import Sequential, Model
from keras import layers
from keras.layers import Dense, Dropout, Input
from tensorflow.keras.utils import to_categorical

In [33]:
y_train = to_categorical(train['QType-Coarse'], train['QType-Coarse'].nunique())
y_test = to_categorical(test['QType-Coarse'], test['QType-Coarse'].nunique())

In [34]:
model = Sequential()
# Add the first layer with input shape
model.add(Dense(units=128, activation='relu', input_shape=(tf_idf_matrix_train.shape[1],)))
model.add(Dropout(0.3))
model.add(Dense(units=6, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               1069440   
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 6)                 774       
                                                                 
Total params: 1070214 (4.08 MB)
Trainable params: 1070214 (4.08 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [50]:
tf_idf_matrix_train_dense = tf_idf_matrix_train.toarray()[:y_train.shape[0]]

In [52]:
n_samples = min(tf_idf_matrix_train_dense.shape[0], y_train.shape[0])
tf_idf_matrix_train_dense = tf_idf_matrix_train_dense[:n_samples]
y_train = y_train[:n_samples]

In [55]:
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), optimizer='adam', metrics=['accuracy'])
training_history = model.fit(tf_idf_matrix_train_dense, tf.keras.utils.to_categorical(y_train), epochs=10, batch_size=100)

Epoch 1/10


  output, from_logits = _get_logits(


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [58]:
# If tf_idf_matrix_test has fewer samples than y_test:
tf_idf_matrix_test = tf_idf_matrix_test.toarray()
tf_idf_matrix_test = np.concatenate((tf_idf_matrix_test, np.zeros((y_test.shape[0] - tf_idf_matrix_test.shape[0], tf_idf_matrix_test.shape[1]))), axis=0)

# If y_test has fewer samples than tf_idf_matrix_test:
y_test = np.concatenate((y_test, np.zeros((tf_idf_matrix_test.shape[0] - y_test.shape[0],))), axis=0)

In [60]:
y_test_onehot = tf.keras.utils.to_categorical(y_test)

In [61]:
loss, accuracy = model.evaluate(tf_idf_matrix_test, y_test_onehot, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))

  output, from_logits = _get_logits(


Testing Accuracy: 0.3949
