In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy.stats import skew, skewtest, norm
import nltk
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import time
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import LabelBinarizer
from scipy import sparse

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import preprocessing, metrics
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Ridge, Lasso, SGDRegressor
from sklearn.metrics import  make_scorer,  mean_squared_error
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline


In [None]:
# Kaggle kernel layout: the TSV files live under ../input.
# For local usage, read 'train.tsv' / 'test.tsv' from the working directory instead.
train_data, test_data = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('../input/train.tsv', '../input/test.tsv')
)

In [None]:
# Preview the first rows of the freshly loaded training frame.
train_data.head()

In [None]:
# Summarise column dtypes and non-null counts of the training frame.
train_data.info()

In [None]:
# Distribution of the raw price column, with a fitted normal curve (fit = norm) overlaid.
sns.distplot(train_data['price'], bins = 20, fit = norm)

In [None]:
# Distribution of log(1 + price).
sns.set(style="white", palette="muted", color_codes=True)
f, axes = plt.subplots(1, 1, figsize=(11, 7))
sns.despine(left=True)
# log1p is the numerically stable form of log(x + 1); draw explicitly on the
# axes created above rather than relying on the implicit "current axes".
sns.distplot(
    np.log1p(train_data['price'].values),
    axlabel='Log(price)',
    label='log(price)',
    bins=50,
    color="y",
    ax=axes,
)
# Hide the y ticks.
plt.setp(axes, yticks=[])
plt.tight_layout()
plt.show()

In [None]:
# Put the target on a log(1 + x) scale, and do the same for the shipping
# column of both frames.
train_data['price'] = np.log1p(train_data['price'])
for frame in (train_data, test_data):
    frame['shipping'] = np.log1p(frame['shipping'])

In [None]:
# Regression target: the 'price' column of the training frame.
y = train_data['price']

In [None]:
def if_catname(row):
    """Flag a missing category, brand or description value.

    Returns 0 when ``row`` compares equal to itself and 1 otherwise.
    NaN is the value that is not equal to itself, so 1 means "missing".
    """
    return 0 if row == row else 1
    
# Add 0/1 "value was missing" indicator columns to both frames.
missing_flag_sources = [
    ('Category_missing', 'category_name'),
    ('Brand_missing', 'brand_name'),
    ('Item_missing', 'item_description'),
]
for frame in (train_data, test_data):
    for flag_column, source_column in missing_flag_sources:
        frame[flag_column] = frame[source_column].apply(if_catname)



In [None]:
# Preview the training frame again, now including the *_missing indicator columns.
train_data.head()

In [None]:
# Replace missing text fields with explicit placeholders.
# The category placeholder has three "/"-separated parts, matching the
# three-level layout the category string is split into later.
text_placeholders = {
    "category_name": "None/None/None",
    "brand_name": "None",
    "item_description": "None",
}
for frame in (train_data, test_data):
    for column, placeholder in text_placeholders.items():
        # Assign the filled column back instead of calling
        # frame[column].fillna(..., inplace=True): the in-place call acts on a
        # temporary Series and does not update the frame under pandas
        # Copy-on-Write.
        frame[column] = frame[column].fillna(placeholder)


In [None]:
# Split the "/"-separated category string into columns and keep the first
# three levels, named L0, L1 and L2.
train_cat, test_cat = (
    frame['category_name']
    .str.split('/', expand=True)
    .reindex(columns=np.arange(3))
    .add_prefix('L')
    for frame in (train_data, test_data)
)

In [None]:
# Check the split category levels (columns L0, L1, L2).
train_cat.head()

In [None]:
# Append the category-level columns to their respective frames (column-wise).
train_data, test_data = (
    pd.concat(parts, axis=1)
    for parts in ([train_data, train_cat], [test_data, test_cat])
)

In [None]:
# Preview the test frame after the category-level columns were attached.
test_data.head()

In [None]:
# Keep the row identifiers aside; ID_test is needed for the submission file.
ID_train, ID_test = train_data['train_id'], test_data['test_id']

In [None]:
# Remove identifier, raw category string and (train only) the target so that
# only model inputs remain. The frames are modified in place.
train_data.drop(["train_id", "category_name", "price"], axis=1, inplace=True)
test_data.drop(["test_id", "category_name"], axis=1, inplace=True)


In [None]:
# Character length of each item description as a numeric feature.
for frame in (train_data, test_data):
    frame['desc_length'] = frame['item_description'].map(len)

In [None]:
# Histogram (20 bins) of description lengths in the training frame.
train_data['desc_length'].plot(bins=20, kind='hist')

In [None]:
# Remember how many rows each frame has so the stacked feature matrix can be
# split back into train and test parts.
ntrain, ntest = train_data.shape[0], test_data.shape[0]
print(train_data.shape)
print(test_data.shape)


In [None]:
# Stack train on top of test (train rows first) with a fresh 0..n-1 index.
Combined_data = pd.concat([train_data, test_data], ignore_index=True)

In [None]:
# Report the (rows, columns) shape of the stacked train + test frame.
print("Combined size is : {}".format(Combined_data.shape))

In [None]:

# Partition the columns by dtype: object columns are treated as
# categorical/text, everything else as numerical.
object_part = Combined_data.select_dtypes(include=["object"])
non_object_part = Combined_data.select_dtypes(exclude=["object"])
categorical_features = object_part.columns
numerical_features = non_object_part.columns
print("Numerical features : " + str(numerical_features.size))
print("Categorical features : " + str(categorical_features.size))

In [None]:
# Keep only the non-object columns; they are fed to the model as-is.
Combined_data_numerical = Combined_data.loc[:, numerical_features]


In [None]:
# Display the names of the object-dtype columns that still need encoding.
categorical_features

In [None]:
# Token-count encoding of the item names, fitted on train and test together.
name_text = Combined_data['name']
count = CountVectorizer()
X_name = count.fit_transform(name_text)
X_name.shape

In [None]:
# Sparse one-hot encoding of the brand name.
brand_labels = Combined_data['brand_name']
vector_brand = LabelBinarizer(sparse_output=True)
X_brand = vector_brand.fit_transform(brand_labels)
X_brand.shape

In [None]:
# Sparse one-hot encoding of the top category level (L0).
vector_L0 = LabelBinarizer(sparse_output=True)
# Fit the L0 binarizer itself; refitting vector_brand here would leave
# vector_L0 unfitted and overwrite the brand binarizer's learned classes.
X_L0 = vector_L0.fit_transform(Combined_data['L0'])
X_L0.shape

In [None]:
# Sparse one-hot encoding of the second category level (L1).
vector_L1 = LabelBinarizer(sparse_output=True)
# Fit the L1 binarizer itself; refitting vector_brand here would leave
# vector_L1 unfitted and overwrite the brand binarizer's learned classes.
X_L1 = vector_L1.fit_transform(Combined_data['L1'])
X_L1.shape

In [None]:
# Sparse one-hot encoding of the third category level (L2).
vector_L2 = LabelBinarizer(sparse_output=True)
# Fit the L2 binarizer itself; refitting vector_brand here would leave
# vector_L2 unfitted and overwrite the brand binarizer's learned classes.
X_L2 = vector_L2.fit_transform(Combined_data['L2'])
X_L2.shape

In [None]:
# Numerical columns as a plain array so they can be stacked next to the
# sparse feature blocks.
num_features = Combined_data_numerical.values
num_features.shape

In [None]:

# TF-IDF weights for single words of the item description, with English
# stop words removed.
description_text = Combined_data['item_description']
tfidf_vec = TfidfVectorizer(stop_words='english', ngram_range=(1,1))
X_description = tfidf_vec.fit_transform(description_text)
X_description.shape

In [None]:
# Assemble the full design matrix. Each feature block appears exactly once;
# stacking X_brand twice would duplicate every brand column.
feature_blocks = (
    X_name,
    X_brand,
    X_L0,
    X_L1,
    X_L2,
    num_features,
    X_description,
)
# CSR format so the matrix can be sliced by row into train and test parts.
X = sparse.hstack(feature_blocks).tocsr()
X.shape

In [None]:
# The first ntrain rows of X are the training rows; the remainder is test.
train, test = X[:ntrain], X[ntrain:]
train.shape


In [None]:
def evaluate_model(X, y, algorithm):
    """Print cross-validated and hold-out RMSE for an estimator.

    The data is split 80/20 (fixed random_state for repeatability). The
    estimator is scored with 2-fold cross-validation on the 80% part, then
    fitted on that part and scored once on the untouched 20% part.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` and by the estimator.
    y : target values aligned with the rows of ``X``.
    algorithm : estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    None. Results are printed.
    """
    # Split the data that was passed in, not a module-level variable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 1)

    print(algorithm)
    print()

    # The scorer returns negative MSE per fold; negate and take the square
    # root to get RMSE per fold, then average across folds.
    cv_rmse = np.sqrt(-cross_val_score(algorithm, X_train, y_train, scoring="neg_mean_squared_error", cv = 2)).mean()
    print('Train - Root Mean Squared Error (2-fold CV)')
    print(cv_rmse)
    print()

    # Fit the estimator under evaluation on the training part and score it on
    # data it has never seen.
    algorithm.fit(X_train, y_train)
    y_test_pred = algorithm.predict(X_test)
    holdout_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    print('Test - Root Mean Squared Error (hold-out)')
    print(holdout_rmse)
    print()

# NOTE(review): 'solver' is not referenced by any code visible in this notebook.
solver = "lsqr"

In [None]:
# Evaluate a default Ridge regressor (wrapped in a one-step pipeline) on the
# training matrix. The two commented lines below are an earlier variant.
#pipe = make_pipeline(RobustScaler(), LinearRegression())
#evaluate_model(train_data, y, pipe)
pipe = make_pipeline(Ridge())
evaluate_model(train, y, pipe)

In [None]:
# Plot residuals
#    plt.scatter(y_train_pred, y_train_pred - y_train, c = "blue", marker = "s", label = "Training data")
#    plt.scatter(y_test_pred, y_test_pred - y_test, c = "lightgreen", marker = "s", label = "Validation data")
#    plt.xlabel("Predicted values")
#    plt.ylabel("Residuals")
#    plt.legend(loc = "upper left")
#    plt.hlines(y = 0, xmin = 1.5, xmax = 9, color = "red")
#    plt.show()

# Plot predictions
#    plt.scatter(y_train_pred, y_train, c = "blue", marker = "s", label = "Training data")
#    plt.scatter(y_test_pred, y_test, c = "lightgreen", marker = "s", label = "Validation data")
#    plt.xlabel("Predicted values")
#    plt.ylabel("Real values")
#    plt.legend(loc = "upper left")
#    plt.plot([1.5, 9], [1.5, 9], c = "red")
#    plt.show()  

In [None]:
 # Final model: Ridge regression fitted on every training row.
 # The `normalize` argument is intentionally not passed: it was removed from
 # Ridge in scikit-learn 1.2, and False was its default, so behaviour is the same.
 ridge = Ridge(solver='auto', fit_intercept=True, alpha=0.4, max_iter=200, tol=0.01)
 ridge.fit(train, y)

In [None]:
# Predict on the test rows and apply expm1 (the inverse of log1p) to the
# model output.
labels_ridge = np.expm1(ridge.predict(test))

In [None]:
# Write the submission file: one row per test item with its predicted price.
submission = pd.DataFrame({'test_id': ID_test, 'price': labels_ridge})
submission.to_csv('MercariPredictions.csv', index=False)