In [20]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error
import re
import nltk
from nltk.corpus import stopwords

In [21]:
# Load the training data
train_df = pd.read_csv("train.csv")

In [23]:
# drop unnecessary columns
train_df = train_df.drop(['PRODUCT_ID', 'PRODUCT_TYPE_ID'], axis=1)

In [24]:
train_df.head()

Unnamed: 0,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_LENGTH
0,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,2125.98
1,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,393.7
2,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",748.031495
3,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,787.401574
4,The United Empire Loyalists: A Chronicle of th...,,,598.424


In [25]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2249698 entries, 0 to 2249697
Data columns (total 4 columns):
 #   Column          Dtype  
---  ------          -----  
 0   TITLE           object 
 1   BULLET_POINTS   object 
 2   DESCRIPTION     object 
 3   PRODUCT_LENGTH  float64
dtypes: float64(1), object(3)
memory usage: 68.7+ MB


In [26]:
train_df.isnull().count()

TITLE             2249698
BULLET_POINTS     2249698
DESCRIPTION       2249698
PRODUCT_LENGTH    2249698
dtype: int64

In [28]:
# remove NaN values
train_df = train_df.dropna()

In [30]:
# lowercase all text columns
train_df['TITLE'] = train_df['TITLE'].str.lower()
train_df['BULLET_POINTS'] = train_df['BULLET_POINTS'].str.lower()
train_df['DESCRIPTION'] = train_df['DESCRIPTION'].str.lower()

In [32]:
# remove non-alphanumeric characters and stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
train_df['TITLE'] = train_df['TITLE'].apply(lambda x: re.sub(r'\W+', ' ', x))
train_df['TITLE'] = train_df['TITLE'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
train_df['BULLET_POINTS'] = train_df['BULLET_POINTS'].apply(lambda x: re.sub(r'\W+', ' ', x))
train_df['BULLET_POINTS'] = train_df['BULLET_POINTS'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
train_df['DESCRIPTION'] = train_df['DESCRIPTION'].apply(lambda x: re.sub(r'\W+', ' ', x))
train_df['DESCRIPTION'] = train_df['DESCRIPTION'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shankarlohar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
train_df.isnull().count()

TITLE             1038460
BULLET_POINTS     1038460
DESCRIPTION       1038460
PRODUCT_LENGTH    1038460
dtype: int64

In [36]:
# Fill any missing values in the DataFrame with empty strings
train_df.fillna("", inplace=True)

In [37]:
# Convert non-string data types to strings
train_df = train_df.applymap(lambda x: str(x) if type(x) != str else x)

In [38]:
# Extract the input features and target variable
X_train = train_df[["TITLE", "DESCRIPTION", "BULLET_POINTS"]]
y_train = train_df["PRODUCT_LENGTH"]

In [39]:
# Preprocess the text data using count vectorization
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train.apply(lambda x: " ".join(x), axis=1))

In [40]:
# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

Model Trained so far. Now use the Trained model on test data.

In [41]:
# Load the test data
test_df = pd.read_csv("test.csv")

In [42]:
# Fill any missing values in the DataFrame with empty strings
test_df = test_df.fillna("")

In [43]:
# Convert non-string data types to strings
test_df = test_df.applymap(lambda x: str(x) if type(x) != str else x)

In [44]:
# Extract the input features
X_test = test_df[["TITLE", "DESCRIPTION", "BULLET_POINTS", "PRODUCT_TYPE_ID"]]

In [45]:
# Preprocess the text data using count vectorization
X_test = vectorizer.transform(X_test.apply(lambda x: " ".join(x), axis=1))

In [46]:
# Make predictions on the test data
y_pred = model.predict(X_test)

In [47]:
# Create a submission DataFrame
submission_df = pd.DataFrame({"PRODUCT_ID": test_df["PRODUCT_ID"], "PRODUCT_LENGTH": y_pred})

In [48]:
# Save the submission DataFrame to a CSV file
submission_df.to_csv("submission2.csv", index=False)

In [50]:
submission_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734736 entries, 0 to 734735
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PRODUCT_ID      734736 non-null  object 
 1   PRODUCT_LENGTH  734736 non-null  float64
dtypes: float64(1), object(1)
memory usage: 11.2+ MB
