In [1]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# imports
import pandas as pd
import warnings
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")

In [14]:
trainingSet = pd.read_csv("/content/gdrive/MyDrive/Notes/Shivangi/CS 506 (TDS)/Midterm/data/X_train.csv")
testingSet = pd.read_csv("/content/gdrive/MyDrive/Notes/Shivangi/CS 506 (TDS)/Midterm/data/X_test.csv")

In [15]:
trainingSet.fillna(value='', inplace=True)

In [16]:
sample_data_some = trainingSet.iloc[:40000]
sample_data_some = sample_data_some.sample(frac = 1, random_state = 0)
sample_data_some = sample_data_some.reset_index(drop = True)

In [17]:
# appending testing set to sampled training set to get same sized tfidf vectors
data = pd.concat([sample_data_some, testingSet], axis=0)
data.shape

(53976, 11)

In [18]:
word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range = (1,3))
character_vectorizer = TfidfVectorizer(analyzer='char', ngram_range = (2,4))

In [19]:
text_vector = word_vectorizer.fit_transform(data['Text'])
summ_vector = character_vectorizer.fit_transform(data['Summary'])

In [20]:
# splitting again into test and train sets
text_vector_train = text_vector[:40000,:]
text_vector_test = text_vector[40000:,:]

summ_vector_train = summ_vector[:40000,:]
summ_vector_test = summ_vector[40000:,:]

In [21]:
from scipy.sparse import hstack

tfidf_train = hstack([text_vector_train, summ_vector_train])
tfidf_test = hstack([text_vector_test, summ_vector_test])

In [22]:
import lightgbm as lgb

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_train, sample_data_some['Score'][:40000], test_size=0.2, random_state=0)

params = {
    'objective': 'regression', 
    'metric': 'rmse',
    'num_leaves': 30,
    'learning_rate': 0.06,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
}

model = lgb.LGBMRegressor(**params)

model.fit(X_train, y_train, eval_set = [(X_test, y_test)], early_stopping_rounds=10, eval_metric='rmse')

[1]	valid_0's rmse: 1.15834
[2]	valid_0's rmse: 1.14529
[3]	valid_0's rmse: 1.13298
[4]	valid_0's rmse: 1.12209
[5]	valid_0's rmse: 1.11248
[6]	valid_0's rmse: 1.10202
[7]	valid_0's rmse: 1.09377
[8]	valid_0's rmse: 1.08514
[9]	valid_0's rmse: 1.07801
[10]	valid_0's rmse: 1.07065
[11]	valid_0's rmse: 1.06372
[12]	valid_0's rmse: 1.0565
[13]	valid_0's rmse: 1.05018
[14]	valid_0's rmse: 1.04387
[15]	valid_0's rmse: 1.03797
[16]	valid_0's rmse: 1.03222
[17]	valid_0's rmse: 1.02734
[18]	valid_0's rmse: 1.02206
[19]	valid_0's rmse: 1.01641
[20]	valid_0's rmse: 1.01281
[21]	valid_0's rmse: 1.00869
[22]	valid_0's rmse: 1.0046
[23]	valid_0's rmse: 1.00099
[24]	valid_0's rmse: 0.997411
[25]	valid_0's rmse: 0.994233
[26]	valid_0's rmse: 0.989761
[27]	valid_0's rmse: 0.98548
[28]	valid_0's rmse: 0.981617
[29]	valid_0's rmse: 0.977626
[30]	valid_0's rmse: 0.974016
[31]	valid_0's rmse: 0.970683
[32]	valid_0's rmse: 0.966941
[33]	valid_0's rmse: 0.963991
[34]	valid_0's rmse: 0.960536
[35]	valid_0's 

In [23]:
testingSet['Score'] = model.predict(tfidf_test)

In [24]:
submission = testingSet[['Id', 'Score']]

In [38]:
submission.to_csv("/content/gdrive/MyDrive/Notes/Shivangi/CS 506 (TDS)/Midterm/data/LightGBM(40K)_result.csv", index=False)

In [41]:
import pickle

filename = '/content/gdrive/MyDrive/Notes/Shivangi/CS 506 (TDS)/Midterm/Models/LightGBM(40K).obj'
with open(filename, 'wb') as file:
    pickle.dump(model, file)