Does Linear Regression

In [None]:
import pprint
import numpy as np
# np.set_printoptions(threshold=np.inf)

# entry[0] = itemid
# entry[1] = name
# entry[2] = condition_id
# entry[3] = category_name
# entry[4] = brand_name
# entry[5] = price
# entry[6] = shipping
# entry[7] = description


class MercariAnalysis:
    """Linear-regression price model over tab-separated listing files.

    Pipeline (see ``run``): ``parse`` reads the training file and collects
    per-category price statistics, ``build_features`` turns every training
    row into a feature vector, ``normal_eqn`` (or ``gradient_descent``)
    fits ``theta`` and ``make_predictions`` scores the test file.

    Training rows are indexed as: 3 = category, 4 = brand, 5 = price,
    6 = shipping flag.  Test rows are read with the shipping flag at
    index 5.

    Attributes set by the pipeline:
        data: training rows keyed by their first field.
        cpc: category -> [mean price, std of price].
        cpbpc: category -> brand -> mean price.
        x_train / y_train: design matrix (``np.matrix``) and target list.
        x_test: design matrix of the test rows (``np.matrix``).
        theta: fitted coefficients as a column vector.
        predictions: raw model output for the test rows.
        final_predictions: predictions with negative values replaced by 0.
    """

    def __init__(self):
        self.trainfn = ''
        self.testfn = ''

        self.data = {}

        self.x_train = []
        self.y_train = []
        self.x_test = []
        self.y_test = []

        self.cpc = {}
        self.cpbpc = {}

        self.mu = 0
        self.sigma = 0

        self.predictions = []
        self.theta = []
        self.final_predictions = []

    def print_data(self):
        """Pretty-print ``x_train`` and ``y_train`` between separator rules."""
        rule = '-----------------------'
        for block in (self.x_train, self.y_train):
            for _ in range(3):
                print(rule)
            pprint.pprint(block)
        for _ in range(3):
            print(rule)

    @staticmethod
    def _shipping_sign(raw):
        """Encode a shipping flag as -1 (flag equals 0) or 1 (anything else).

        Fields read from the TSV files are strings, so the flag is compared
        numerically; comparing the raw string with the integer 0 is never
        true.  Values that cannot be parsed as a number map to 1.
        """
        try:
            return -1 if float(raw) == 0 else 1
        except (TypeError, ValueError):
            return 1

    def _brand_weight(self, category, brand):
        """Return the brand's mean price as a z-score within its category.

        Returns 0 when the category/brand pair was not seen by ``parse`` or
        when the category's price standard deviation is 0.
        """
        brands = self.cpbpc.get(category)
        if brands is None or brand not in brands:
            return 0
        cat_mean = self.cpc[category][0]
        cat_std = self.cpc[category][1]
        if cat_std == 0:
            return 0
        return (brands[brand] - cat_mean) / float(cat_std)

    @staticmethod
    def _feature_row(brand_weight, shipping):
        """Build one feature vector: bias, then cubic expansions of both inputs."""
        return [1,
                brand_weight,
                brand_weight ** 2,
                brand_weight ** 3,
                shipping,
                shipping ** 2,
                shipping ** 3,
                ]

    @staticmethod
    def _safe_sigma(sigma):
        """Return ``sigma`` as a float matrix with zeros replaced by 1.

        Constant columns have a standard deviation of 0; dividing by 1
        instead leaves them at 0 after centring rather than producing NaN.
        """
        sigma = np.asmatrix(sigma, dtype=float)
        return np.asmatrix(np.where(sigma == 0, 1.0, sigma))

    def build_features(self):
        """Build ``x_train`` / ``y_train`` from the rows stored by ``parse``.

        Both attributes are rebuilt from scratch on every call, so the
        method can be called repeatedly.  The resulting matrix is printed.
        """
        rows = []
        targets = []
        for entry in self.data.values():
            targets.append(float(entry[5]))
            rows.append(self._feature_row(
                self._brand_weight(entry[3], entry[4]),
                self._shipping_sign(entry[6]),
            ))

        self.y_train = targets
        # Force a float matrix so later in-place normalisation cannot
        # truncate values when every feature happens to be an integer.
        self.x_train = np.matrix(rows, dtype=float)

        print(self.x_train)

    def normalize_features(self):
        """Standardise every column of ``x_train`` except the bias column.

        Returns:
            ``(mu, sigma)``: per-column mean and standard deviation, with
            zero deviations replaced by 1 (see ``_safe_sigma``).
        """
        mu = np.mean(self.x_train, axis=0)
        sigma = self._safe_sigma(np.std(self.x_train, axis=0))
        # the slicing prevents normalizing the 1's column
        self.x_train[:, 1:] = (self.x_train[:, 1:] - mu[:, 1:]) / sigma[:, 1:]
        return mu, sigma

    def normalize_parameters(self, mu, sigma):
        """Apply a previously computed ``mu`` / ``sigma`` to ``x_test`` in place.

        Zero entries in ``sigma`` are treated as 1 to avoid division by zero.
        """
        sigma = self._safe_sigma(sigma)
        # the slicing prevents normalizing the 1's column
        self.x_test[:, 1:] = (self.x_test[:, 1:] - mu[:, 1:]) / sigma[:, 1:]
        self.x_test[:, 0] = 1

    def parse(self):
        """Read ``self.trainfn`` and collect rows plus price statistics.

        The first line is skipped as a header and blank lines are ignored.
        Populates ``data``, ``cpc`` (category -> [mean, std] of price) and
        ``cpbpc`` (category -> brand -> mean price); all three are rebuilt
        from scratch on every call.
        """
        prices_by_cat = {}
        totals_by_cat_brand = {}
        self.data = {}

        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(self.trainfn, encoding='utf-8') as tsv:
            next(tsv, None)  # header line
            for line in tsv:
                stripped = line.strip()
                if not stripped:
                    continue
                entry = stripped.split('\t')
                self.data[entry[0]] = entry

                category, brand = entry[3], entry[4]
                price = float(entry[5])

                prices_by_cat.setdefault(category, []).append(price)

                # running [sum, count] per (category, brand)
                totals = totals_by_cat_brand.setdefault(category, {}) \
                    .setdefault(brand, [0.0, 0])
                totals[0] += price
                totals[1] += 1

        self.cpc = {
            category: [np.mean(prices), np.std(prices)]
            for category, prices in prices_by_cat.items()
        }
        self.cpbpc = {
            category: {
                brand: total / float(count)
                for brand, (total, count) in brands.items()
            }
            for category, brands in totals_by_cat_brand.items()
        }

    # vectorized gradient descent
    def gradient_descent(self, alpha, num_iters):
        """Fit ``theta`` by batch gradient descent on the squared error.

        Args:
            alpha: learning rate.
            num_iters: number of full-batch update steps.

        Returns:
            The fitted coefficient column vector (also stored in ``theta``).
        """
        x_np = np.matrix(self.x_train)
        y_np = np.matrix(self.y_train).T
        m = x_np.shape[0]
        n = x_np.shape[1]
        theta = np.zeros((n, 1))
        x_t = x_np.T

        for _ in range(num_iters):
            # Sum over rows of x_i^T * (x_i * theta - y_i), as one product.
            residual = (x_np * theta) - y_np
            delta = (1 / float(m)) * (x_t * residual)
            theta = theta - (alpha * delta)

        self.theta = theta
        return theta

    def normal_eqn(self):
        """Fit ``theta`` in closed form via the pseudo-inverse and print it."""
        x_np = np.matrix(self.x_train)
        y_np = np.matrix(self.y_train)
        theta = np.linalg.pinv(x_np.T * x_np) * x_np.T * y_np.T
        print("theta from n_e: ")
        print(theta)
        self.theta = theta

    def build_test_feat(self, entry):
        """Return the feature vector for one test row.

        Args:
            entry: list of fields; index 3 = category, 4 = brand,
                5 = shipping flag.

        Category/brand pairs not seen during ``parse`` get a brand weight
        of 0.
        """
        return self._feature_row(
            self._brand_weight(entry[3], entry[4]),
            self._shipping_sign(entry[5]),
        )

    def make_predictions(self):
        """Score every row of ``self.testfn`` with the fitted ``theta``.

        The first line is skipped as a header and blank lines are ignored.
        ``x_test`` is rebuilt on every call; the raw model output is stored
        in ``predictions``.
        """
        rows = []
        with open(self.testfn, encoding='utf-8') as tsv:
            next(tsv, None)  # header line
            for line in tsv:
                stripped = line.strip()
                if not stripped:
                    continue
                rows.append(self.build_test_feat(stripped.split('\t')))

        self.x_test = np.matrix(rows, dtype=float)
        self.predictions = self.x_test * self.theta

    def run(self, trainfn, testfn):
        """Train on ``trainfn``, predict ``testfn`` and clip negatives to 0.

        After the call ``predictions`` is a list of single-element lists
        and ``final_predictions`` is the flat list of clipped values.
        """
        self.trainfn = trainfn
        self.testfn = testfn
        self.parse()
        self.build_features()
        self.normal_eqn()
        self.make_predictions()
        self.predictions = self.predictions.tolist()
        # Start from an empty list so repeated runs do not accumulate.
        self.final_predictions = []
        for item in self.predictions:
            if item[0] < 0:
                self.final_predictions.append(0)
            else:
                self.final_predictions.append(item[0])


# Fit on the competition training file and score the test file.
# (For a local run, point both paths at "train.tsv" / "test.tsv" instead.)
ma = MercariAnalysis()
ma.run(trainfn="../input/train.tsv", testfn="../input/test.tsv")
# ma.predictions holds the raw model output (one single-element list per
# test row); ma.final_predictions holds the same values with negatives
# replaced by 0.

Okay, now let's read in the test data and generate our submission file.

In [None]:
import pandas as pd
# Load the tab-separated test set into a DataFrame and preview it.
df_test = pd.read_table('../input/test.tsv')
df_test.head()

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on
# the column selection: the chained in-place form warns on recent pandas
# and does not modify df_test when Copy-on-Write is enabled.
df_test['item_description'] = df_test['item_description'].fillna('Missing')
# One prediction per test row; pandas raises ValueError if the lengths differ.
df_test['price'] = ma.final_predictions
df_test.head()

In [None]:
# Write only the id and predicted price columns, without the row index.
df_test.to_csv('output.csv', columns=['test_id', 'price'], index=False)