In [None]:
import pandas as pd
import numpy as np
import scipy

In [None]:
from sklearn.linear_model import Ridge,LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer

In [None]:
# Read the tab-separated training and test splits from the input directory.
train = pd.read_csv('../input/train.tsv', delimiter='\t')
test = pd.read_csv('../input/test.tsv', delimiter='\t')


In [None]:
# Feature-extraction limits used by the encoding cells further down.
# Number of most frequent brands kept; every other brand is relabelled 'Other'.
NUM_BRANDS = 2500
# min_df passed to the CountVectorizer that encodes item names.
NAME_MIN_DF = 10
# max_features passed to the TfidfVectorizer that encodes item descriptions.
MAX_FEAT_DESCP = 50000

In [None]:
# Frequency of each brand in the training split (missing brands are not counted).
train['brand_name'].value_counts()

In [None]:
# Quick look at the test split: first three rows, then its (rows, columns) shape.
print(test.head(3), test.shape, sep='\n')

In [None]:
# Concatenate train and test row-wise into a single dataset, train rows first,
# so the encoders below are fitted on both splits together.
# `axis` is passed by keyword: since pandas 2.0 every concat argument after
# `objs` is keyword-only, so the positional form raises TypeError there.
dt = pd.concat([train, test], axis=0)


In [None]:
# Show the first three rows of the combined train+test frame.
dt.head(3)


In [None]:
# Brand frequencies across the combined train+test frame.
dt['brand_name'].value_counts()

In [None]:
# Number of training rows; used later to split the stacked feature matrix
# back into its train and test parts.
nrow_train = len(train)


In [None]:
# Regression target: log(1 + price) of the training rows, for a better fit.
target = np.log1p(train['price'])
target

In [None]:
# Replace missing categories with 'Other' (stored as a categorical column)
# and missing brands with the placeholder 'unknown'.
dt['category_name'] = dt['category_name'].fillna('Other').astype('category')
dt['brand_name'] = dt['brand_name'].fillna('unknown')

In [None]:
# Count the remaining missing values in each column of the combined frame.
dt.isnull().sum()

In [None]:
# value_counts() orders brands from most to least frequent, so the first
# NUM_BRANDS index entries are the most popular brands.
pop_brands = dt['brand_name'].value_counts().index[:NUM_BRANDS]

In [None]:
# Display the index of brands selected as "popular".
pop_brands

In [None]:
# Relabel every brand outside the popular set as 'Other'.
dt.loc[~dt['brand_name'].isin(pop_brands), 'brand_name'] = 'Other'

In [None]:
# Re-inspect the first three rows after the brand relabelling.
dt.head(3)

In [None]:
# Missing descriptions become the literal string 'None'; condition id and
# brand are converted to categorical dtype ahead of encoding.
dt['item_description'] = dt['item_description'].fillna('None')
dt['item_condition_id'] = dt['item_condition_id'].astype('category')
dt['brand_name'] = dt['brand_name'].astype('category')


In [None]:
# Bag-of-words encoding of the item name; terms appearing in fewer than
# NAME_MIN_DF names are left out of the vocabulary.
count = CountVectorizer(min_df=NAME_MIN_DF)
x_name = count.fit_transform(dt['name'])

In [None]:
# Display the sparse matrix summary (shape, dtype, stored elements) for the name features.
x_name

In [None]:
from scipy.sparse import find

In [None]:
# scipy.sparse.find returns the row indices, column indices and values of
# the non-zero entries of the name-feature matrix.
find(x_name)

In [None]:
# Encoding category variables.
# Join all distinct category paths with '/', split the result on '/' and
# de-duplicate to obtain the individual category tokens.
unique_categories = pd.Series(
    '/'.join(dt['category_name'].unique().astype('str')).split('/')
).unique()

In [None]:
# Preview the first ten category tokens. ndarray.take(10) would instead
# return only the single element at position 10 (and raise IndexError when
# fewer than eleven tokens exist); slicing is safe for any length.
unique_categories[:10]

In [None]:
# Bag-of-words vectorizer with default settings, used for the category column.
count_category = CountVectorizer()

In [None]:
# Learn the category vocabulary and encode every row's category string.
x_category = count_category.fit_transform(dt['category_name'])

In [None]:
# TF-IDF vectorizer for item_description: unigrams through trigrams, English
# stop words removed, vocabulary capped at MAX_FEAT_DESCP terms.
count_desc = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    max_features=MAX_FEAT_DESCP,
)


In [None]:
# Fit the TF-IDF vocabulary on all descriptions and encode them.
x_descp = count_desc.fit_transform(dt['item_description'])

In [None]:
# One-hot encode the brand label, producing a sparse matrix.
vect_brand = LabelBinarizer(sparse_output=True)
x_brand = vect_brand.fit_transform(dt['brand_name'])

In [None]:
# Dummy encoders: item_condition_id (cast to category earlier) is expanded
# into indicator columns; shipping is passed through unchanged provided it is
# numeric (get_dummies only expands object/string/category columns).
# The indicator dtype is pinned to uint8 because pandas 2.0 changed the
# default from uint8 to bool; pinning keeps the resulting matrix numeric and
# identical across pandas versions.
x_dummies = scipy.sparse.csr_matrix(
    pd.get_dummies(
        dt[['item_condition_id', 'shipping']],
        sparse=True,
        dtype=np.uint8,
    ).values
)

In [None]:
# Stack all feature blocks column-wise into a single CSR matrix.
X = scipy.sparse.hstack(
    [x_dummies, x_descp, x_brand, x_category, x_name],
    format='csr',
)

In [None]:
# Sanity check: every feature block should have the same number of rows.
print(*(block.shape for block in (x_dummies, x_category, x_name, x_descp, x_brand)))

In [None]:
# The first nrow_train rows of X are the training rows, because train was
# placed before test when the combined frame was built.
x_train = X[:nrow_train]

# RIDGE

In [None]:
# Ridge regression using the 'lsqr' solver, with no intercept term fitted.
# NOTE: despite the name `clf`, this is a regressor, not a classifier.
clf = Ridge(solver = 'lsqr',fit_intercept = False)

In [None]:
# Training target: the log1p-transformed prices computed earlier.
y_train = target

In [None]:
# Fit the ridge regressor on the training features and log-price target.
clf.fit(x_train,y_train)

In [None]:
# The rows after the first nrow_train belong to the test split.
x_test = X[nrow_train:]

In [None]:
# Predictions are on the log1p(price) scale the model was trained on.
preds = clf.predict(x_test)

In [None]:
# Undo the log1p transform to get back to the price scale. A linear model's
# output is unbounded, so expm1 can produce values below zero; floor them at 0
# so the submission never contains a negative price.
test["price"] = np.clip(np.expm1(preds), 0, None)
# Write the submission file containing only the id and predicted price.
test[['test_id','price']].to_csv('ridge_clf.csv',index = False)