# **Predict Price Sales - Knightbearr**

Note: sorry if my English is not perfect; hopefully you can still understand.

# Step 1: Import Libraries

Import the important libraries

In [None]:
# Data and Count
import pandas as pd
import numpy as np
import math

# Visualize
import matplotlib.pyplot as plt
import seaborn as sns

# Train and Split
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Preprocessing
from sklearn.preprocessing import LabelEncoder

# Accuracy
from sklearn import metrics

# Info
from sklearn.feature_selection import mutual_info_regression
import os
# Print the full path of every file found under the Kaggle input directory
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fileName in files:
        print(os.path.join(folder, fileName))

In [None]:
# Pandas display limits used for every table printed in this notebook
for optionName, optionValue in (
    ('display.max_rows', 25),
    ('display.max_columns', 25),
    ('display.width', 100),
):
    pd.set_option(optionName, optionValue)

# Seaborn theme applied to every plot drawn below
sns.set_theme(style='darkgrid', palette='deep', font='sans-serif', color_codes=True)

# Step 2: Load Data

Load the data that we want to analyze.

In [None]:
# Read the constituents-financials CSV into a DataFrame
compData = pd.read_csv('../input/sp-500-companies/constituents-financials.csv')

# Keep an untouched copy of the raw data (DataFrame.copy is deep by default)
backupData = compData.copy()

# Step 3: Data Checking and Cleaning

We must check the data every time we want to build a model, because this step is important: if you run into a messy dataset, you have to clean it whether you want to or not.

In [None]:
# Preview the first five rows
compData.head(n=5)

In [None]:
# Preview the last five rows
compData.tail(n=5)

In [None]:
# (number of rows, number of columns)
(len(compData.index), len(compData.columns))

In [None]:
# Data type of every column, shown as a one-column table
columnTypes = compData.dtypes
columnTypes.to_frame()

In [None]:
# Fraction of missing values per column
missingShare = compData.isna().mean()
missingShare.to_frame()

In [None]:
# Number of missing values per column (isna is the same check as isnull)
compData.isna().sum().to_frame()

In [None]:
# Normalise column names: '/' and ' ' become '_', everything lower-case
compData.columns = [
    rawName.replace('/', '_').replace(' ', '_').lower()
    for rawName in compData.columns
]
compData.head()

In [None]:
# Show the cleaned column names as a table
columnIndex = compData.columns
columnIndex.to_frame()

In [None]:
# Pull out the two categorical columns inspected below
objectCol = compData[['name', 'sector']]
objectCol

In [None]:
# Count the distinct labels in each categorical column
for col in objectCol.columns:
    labelCount = objectCol[col].unique().size
    print(f'{col} : {labelCount} labels')

In [None]:
# Frequency of each value in `name`, most frequent first
nameCounts = objectCol['name'].value_counts()
nameCounts.sort_values(ascending=False).to_frame()

In [None]:
# Frequency of each value in `sector`, most frequent first
sectorCounts = objectCol['sector'].value_counts()
sectorCounts.sort_values(ascending=False).to_frame()

As we can see in the output above, the most frequent category is Consumer Discretionary.

In [None]:
# Names of the ten most frequent sectors, as a plain list
sectorRanking = objectCol.sector.value_counts().sort_values(ascending=False)
top10 = list(sectorRanking.head(10).index)
top10

We could drop Telecommunication Services if we wanted to, but this time I'll keep it in the data, since there are only 11 categories.

# Step 4: Missing Value

Handle the missing values, since we don't want bad predictions or a drop in our machine learning model's performance.

In [None]:
# Missing values per column before imputation
compData.isna().sum().to_frame()

In [None]:
# Mean imputation for price_earnings: take the column mean, then fill the gaps with it
peColumn = 'price_earnings'
priceEarn = compData[peColumn].mean()
compData[peColumn] = compData[peColumn].fillna(value=priceEarn)

# Remaining missing values in the column
compData[peColumn].isna().sum()

In [None]:
# Mean imputation for price_book: take the column mean, then fill the gaps with it
pbColumn = 'price_book'
priceBook = compData[pbColumn].mean()
compData[pbColumn] = compData[pbColumn].fillna(value=priceBook)

# Remaining missing values in the column
compData[pbColumn].isna().sum()

In [None]:
# Re-count missing values per column after imputation
compData.isna().sum().to_frame()

Okay that's great

# Step 4: Encode

We need to encode the data since it contains categorical values. I'm using LabelEncoder here because the data has a lot of categorical values.

In [None]:
# Create the LabelEncoder instance used to encode the text columns below
labelEncoder = LabelEncoder()

In [None]:
# `data` starts as an alias of compData (no copy is made here)
data = compData

# Label-encode every text (object-dtype) column; the encoder is re-fitted
# for each column, so codes are independent between columns.
for col in data.select_dtypes(include='object').columns:
    data[col] = labelEncoder.fit_transform(data[col])

# Cast every integer column -- including the freshly encoded ones -- to
# float64. This runs once, after all encoding, covers every integer width,
# and still runs when the frame has no text columns at all.
intCols = data.select_dtypes(include='integer').columns
data = data.astype({col: 'float64' for col in intCols})

# Publish the result under the name the rest of the notebook uses
compData = data

In [None]:
# Column dtypes after encoding
encodedTypes = compData.dtypes
encodedTypes.to_frame()

# Step 5: EDA

Analyze and investigate the dataset and summarize its main characteristics, often using data visualization methods.

In [None]:
# Summary statistics (as produced by DataFrame.describe) for the columns of compData
desc = compData.describe()
desc

In [None]:
# Pairwise Pearson correlation: strength and direction of each linear relationship
corr = compData.corr(method='pearson')
corr

In [None]:
# Draw the correlation matrix as an annotated heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(corr, cbar=True, square=True, fmt='.1f', annot=True, annot_kws={'size': 8}, cmap='YlGnBu')
# plt.show() renders the figure; plt.plot() with no arguments draws nothing
# and only echoes an empty list as the cell output.
plt.show()

In [None]:
# Twelve reproducibly sampled rows, transposed so each column reads as a row
compData.sample(n=12, random_state=1).transpose()

As we can see above, the data has 52-week low and high columns. Let's create a new feature holding the middle value of that 52-week range to improve our machine learning models.

In [None]:
# New feature: the middle of the 52-week range, (low + high) / 2.
# A plain difference (low - high) would give the negated range width,
# not a middle value.
compData['52_week_med'] = (compData['52_week_low'] + compData['52_week_high']) / 2

In [None]:
# Show the new feature as a one-column table
compData[['52_week_med']]

In [None]:
# One histogram per column on a 12x12 inch figure
histSize = (12, 12)
compData.hist(figsize=histSize)
plt.show()

In [None]:
# Recompute the Pearson correlation matrix now that 52_week_med exists
corr = compData.corr(method='pearson')
corr

In [None]:
# Draw the updated correlation matrix as an annotated heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(corr, cbar=True, square=True, fmt='.1f', annot=True, annot_kws={'size': 8}, cmap='YlGnBu')
# plt.show() renders the figure; plt.plot() with no arguments draws nothing
# and only echoes an empty list as the cell output.
plt.show()

**Coefficient of Variation**

The coefficient of variation is a measure of relative variability (the standard deviation divided by the mean) that can be used to compare data distributions that have different units.

* **The higher the Coefficient of Variation** = the wider the data you have compared to the average data (more difficult to predict)
* **The Lower Coefficient of Variation** = The narrower the data you have compared to the Average data (Easier to predict)

In [None]:
# Coefficient of variation of price_sales: standard deviation over mean, in percent
covSale = (compData['price_sales'].std() / compData['price_sales'].mean()) * 100
# The label names the column actually measured (price_sales)
print(f'Coefficient Of Variation Price Sales : {covSale}%')

Okay, that is a little bit harder to predict.

In [None]:
# Split the frame into predictors (X) and the price_sales target (y)
# for the mutual-information scores
X = compData.drop(columns='price_sales')
y = compData['price_sales']

# Boolean mask marking the integer-typed columns as discrete features.
# NOTE(review): the encoding step casts integer columns to float64, so this
# mask is probably all False by the time it is built -- confirm.
discreateFeatures = X.dtypes == int

In [None]:
def makeMiScores(X, y, discreateFeatures):
    """Return the mutual-information score of every column of X against y.

    The scores come from mutual_info_regression and are returned as a
    Series named 'MI Scores', indexed by column name, highest score first.
    """
    rawScores = mutual_info_regression(X, y, discrete_features=discreateFeatures)
    labelled = pd.Series(rawScores, index=X.columns, name='MI Scores')
    return labelled.sort_values(ascending=False)

miScores = makeMiScores(X=X, y=y, discreateFeatures=discreateFeatures)
# Show each feature with its MI score
miScores.to_frame()

In [None]:
def plotMiScores(scores):
    """Draw a horizontal bar chart of the scores, smallest at the bottom."""
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    # Label each bar with the name taken from the Series index
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

# Create the figure, then draw the MI score bars on it
plt.figure(figsize=(6, 3), dpi=100)
plotMiScores(scores=miScores)

Data visualization is a great follow-up to a utility ranking. As we can see, **sector - market_cap have mutual information** with price_sales.

# **Step 7: Check the data again**

Check the data again.

In [None]:
# Display the full frame (truncated by the display options set earlier)
compData

In [None]:
# Transposed view: one row per column of the data
compData.transpose()

In [None]:
# Final count of missing values per column
compData.isna().sum().to_frame()

In [None]:
# Final column dtypes
finalTypes = compData.dtypes
finalTypes.to_frame()

Okay, overall the data is ready to use, let's split and train the data.

# **Step 8: Splitting the Data**

Divide the data into training and test sets using the train_test_split function from sklearn.

In [None]:
# Target: the price_sales column
y = compData['price_sales']

# Predictors: every column except price_sales
X = compData.drop(columns=['price_sales'])

In [None]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible
trainX, testX, trainY, testY = train_test_split(
    X, y, test_size=0.1, random_state=12
)

In [None]:
# Shapes of the four pieces, in the order trainX, trainY, testX, testY
tuple(part.shape for part in (trainX, trainY, testX, testY))

# **Step 9: Train and Fit the model**

Train and fit the model using **RandomForestRegressor** Algorithm.

In [None]:
# Build a 500-tree random forest regressor and fit it on the training split
# (fit returns the estimator itself, so the calls can be chained)
modelRFC = RandomForestRegressor(
    n_estimators=500,
    min_samples_leaf=1,
    random_state=12,
).fit(trainX, trainY)

In [None]:
# Predict on the training rows and on the held-out test rows
trainPredict, testPredict = (
    modelRFC.predict(features) for features in (trainX, testX)
)

In [None]:
# Metrics on the training split: compute everything first, then report

trainRsquared = metrics.r2_score(trainY, trainPredict)            # R squared
trainMAE = metrics.mean_absolute_error(trainY, trainPredict)      # mean absolute error
trainMSE = metrics.mean_squared_error(trainY, trainPredict)       # mean squared error
trainRMSE = math.sqrt(trainMSE)                                   # root of the MSE above
trainM = metrics.median_absolute_error(trainY, trainPredict)      # median absolute error

print(f'R-Squared : {trainRsquared}')
print(f'MAE : {trainMAE}')
print(f'MSE : {trainMSE}')
print(f'RMSE : {trainRMSE}')
print(f'MEAE : {trainM}')

In [None]:
# Metrics on the test split: compute everything first, then report

testRsquared = metrics.r2_score(testY, testPredict)            # R squared
testMAE = metrics.mean_absolute_error(testY, testPredict)      # mean absolute error
testMSE = metrics.mean_squared_error(testY, testPredict)       # mean squared error
testRMSE = math.sqrt(testMSE)                                  # root of the MSE above
testM = metrics.median_absolute_error(testY, testPredict)      # median absolute error

print(f'R-Squared : {testRsquared}')
print(f'MAE : {testMAE}')
print(f'MSE : {testMSE}')
print(f'RMSE : {testRMSE}')
print(f'MEAE : {testM}')

# **Step 10: Prediction**

In [None]:
# Put actual and predicted training values side by side, then save them without the index
trainColumns = {
    'Train Actual Price': trainY,
    'Train Predicted Price ': trainPredict,
}
trainOutput = pd.DataFrame(trainColumns)
trainOutput.to_csv('Train Prediction.csv', index=False)

In [None]:
# Put actual and predicted test values side by side, then save them without the index
testColumns = {
    'Test Actual Price': testY,
    'Test Predicted Price ': testPredict,
}
testOutput = pd.DataFrame(testColumns)
testOutput.to_csv('Test Prediction.csv', index=False)

In [None]:
# Read the saved training predictions back and show the first ten rows
trainPredictedOutput = pd.read_csv('./Train Prediction.csv')
trainPredictedOutput.head(n=10)

In [None]:
# Read the saved test predictions back and show the first ten rows
testPredictedOutput = pd.read_csv('./Test Prediction.csv')
testPredictedOutput.head(n=10)

# **Don't forget to give me feedback!!! Thanks!!!**