In [4]:
# Loading dataset
import pandas as pd
from pandas import read_csv
import numpy as np
filename = 'sales - Sheet1.csv'
data = read_csv(filename)
data.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month,sales_in_fourth_month,sales_in_fifith_month,sales_in_sixth_month
0,,2,500,300,200,123,32
1,,4,300,650,900,1212,1434
2,four,600,200,400,305,456,678
3,nine,450,320,650,987,1231,1567
4,seven,600,250,350,234,213,423


In [5]:
# Checking missing values
data.isnull().sum()

rate                     2
sales_in_first_month     0
sales_in_second_month    0
sales_in_third_month     0
sales_in_fourth_month    0
sales_in_fifith_month    0
sales_in_sixth_month     0
dtype: int64

In [7]:
# Mean of columns
for col in data.columns:
    if data[col].dtype == 'object':
        try:
            data[col] = pd.to_numeric(data[col], errors='coerce')

        except:
            pass


In [8]:
# Filling missing values
data['rate'].fillna(0, inplace=True)
data['sales_in_first_month'].fillna(data['sales_in_first_month'].mean(), inplace=True)
data.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['rate'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['sales_in_first_month'].fillna(data['sales_in_first_month'].mean(), inplace=True)


Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month,sales_in_fourth_month,sales_in_fifith_month,sales_in_sixth_month
0,0.0,2,500,300,200,123,32
1,0.0,4,300,650,900,1212,1434
2,0.0,600,200,400,305,456,678
3,0.0,450,320,650,987,1231,1567
4,0.0,600,250,350,234,213,423


In [23]:
# Feature selection
X = data.iloc[:, :6]
y = data.iloc[:, -1]

In [24]:
X.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month,sales_in_fourth_month,sales_in_fifith_month
0,0.0,2,500,300,200,123
1,0.0,4,300,650,900,1212
2,0.0,600,200,400,305,456
3,0.0,450,320,650,987,1231
4,0.0,600,250,350,234,213


In [25]:
y.head()

0      32
1    1434
2     678
3    1567
4     423
Name: sales_in_sixth_month, dtype: int64

In [26]:
# Convert words to numbers
def convert_to_int(word):
    word_dict = {'one':1, 'two':2, 'three':3, 'four':4, 'five':5, 'six':6, 'seven':7, 'eight':8, 'nine':9, 'ten':10, 'eleven':11, 'twelve':12, 'zero':0, 0:0}
    return word_dict[word]                 

In [27]:
X['rate'] = X['rate'].apply(lambda x : convert_to_int(x))

In [28]:
X.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month,sales_in_fourth_month,sales_in_fifith_month
0,0,2,500,300,200,123
1,0,4,300,650,900,1212
2,0,600,200,400,305,456
3,0,450,320,650,987,1231
4,0,600,250,350,234,213


In [29]:
# Conctatenate
df = pd.concat([X,y], axis=1)
df.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month,sales_in_fourth_month,sales_in_fifith_month,sales_in_sixth_month
0,0,2,500,300,200,123,32
1,0,4,300,650,900,1212,1434
2,0,600,200,400,305,456,678
3,0,450,320,650,987,1231,1567
4,0,600,250,350,234,213,423


In [30]:
df.dtypes

rate                     int64
sales_in_first_month     int64
sales_in_second_month    int64
sales_in_third_month     int64
sales_in_fourth_month    int64
sales_in_fifith_month    int64
sales_in_sixth_month     int64
dtype: object

In [31]:
# Saving clean dataset
df.to_csv('clean_sales.csv')

In [32]:
# Fitting the model
from sklearn.linear_model import LinearRegression
clf = LinearRegression()
clf.fit(X, y)
clf.score(X, y)

1.0

In [33]:
# Saving a model
import pickle
pickle.dump(clf, open('model.pkl', 'wb'))

In [35]:
# Making prediction
model = pickle.load(open('model.pkl', 'rb'))
print(model.predict([[4, 300, 500,600,900,1000]]))

[3622.21581291]


