In [1]:
# Import necessary libraries
import pandas as pd
import random
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, AdaBoostClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import boxcox
from scipy.special import boxcox1p
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.datasets import make_classification
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from scipy.cluster.hierarchy import dendrogram, linkage
from bayes_opt import BayesianOptimization
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [4]:
# Load the IPO listing-day gain report from CSV into the raw dataframe df
# NOTE(review): absolute, machine-specific path — consider a configurable data directory
ipo_csv_path = '/Users/siddharthhaveliwala/dataglacierrepos/week-4/IPO_Prediction_week3/data/ipo_report_listing_day_gain.csv'
df = pd.read_csv(ipo_csv_path)

In [5]:
# Summary of dataframe df: prints the index range plus, for every column,
# its non-null count and dtype (useful for spotting columns with missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1429 entries, 0 to 1428
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Issuer Company       1429 non-null   object 
 1   Listing Date         1428 non-null   object 
 2   Issue Price          1429 non-null   float64
 3   Lot Size             1391 non-null   float64
 4   Issue Price (Rs Cr)  1429 non-null   float64
 5   P/E Ratio            356 non-null    float64
 6   QIB                  605 non-null    float64
 7   NII                  1053 non-null   float64
 8   RII                  1049 non-null   float64
 9   EMP                  227 non-null    float64
 10  TOTAL                1056 non-null   float64
 11  Open Price           1427 non-null   float64
 12  Low Price            1427 non-null   float64
 13  High Price           1427 non-null   float64
 14  Close Price          1427 non-null   float64
 15  % Change             1427 non-null   float64

In [13]:
# Preview the first five rows of the raw dataframe (head() defaults to n=5)
df.head()

Unnamed: 0,Issuer Company,Listing Date,Issue Price,Lot Size,Issue Price (Rs Cr),P/E Ratio,QIB,NII,RII,EMP,TOTAL,Open Price,Low Price,High Price,Close Price,% Change
0,Swashthik Plascon Limited,12/5/23,86.0,1600.0,40.76,17.23,3.42,35.76,13.58,,15.43,120.1,115.25,126.1,126.1,46.63
1,Flair Writing Industries Limited,12/1/23,304.0,49.0,593.0,23.36,122.02,35.23,13.73,,49.28,503.0,452.7,514.0,452.7,48.91
2,Fedbank Financial Services Limited,11/30/23,140.0,107.0,1092.26,25.04,3.48,1.49,1.88,1.34,2.24,137.75,133.15,148.0,140.0,0.0
3,Rockingdeals Circular Economy Limited,11/30/23,140.0,1000.0,21.0,33.33,47.38,458.6,201.42,,213.64,300.0,300.0,315.0,315.0,125.0
4,Tata Technologies Limited,11/30/23,500.0,30.0,3042.51,32.53,203.41,62.11,16.5,3.7,69.43,1199.95,1199.95,1400.0,1314.25,162.85


In [14]:
# Show the column labels and the (rows, columns) shape of the raw dataframe as a tuple
df.columns, df.shape

(Index(['Issuer Company', 'Listing Date', 'Issue Price', 'Lot Size',
        'Issue Price (Rs Cr)', 'P/E Ratio', 'QIB', 'NII', 'RII', 'EMP', 'TOTAL',
        'Open Price', 'Low Price', 'High Price', 'Close Price', '% Change'],
       dtype='object'),
 (1429, 16))

In [16]:
# Count the missing values in each column of the raw dataframe
df.isna().sum()

Issuer Company            0
Listing Date              1
Issue Price               0
Lot Size                 38
Issue Price (Rs Cr)       0
P/E Ratio              1073
QIB                     824
NII                     376
RII                     380
EMP                    1202
TOTAL                   373
Open Price                2
Low Price                 2
High Price                2
Close Price               2
% Change                  2
dtype: int64

In [17]:
# Columns excluded from the working dataframe: issuer name, listing date,
# the listing-day price columns, and the mostly-empty P/E Ratio and EMP columns
columns_to_discard = [
    'Issuer Company', 'Listing Date',
    'P/E Ratio', 'EMP',
    'Open Price', 'Low Price', 'High Price', 'Close Price',
]

# Discard those columns, then drop every row that still has a missing value,
# and keep the result as the new dataframe udf
udf = df.drop(columns=columns_to_discard).dropna()

In [19]:
# Summary of the reduced dataframe udf after dropping columns and rows with missing values
udf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 598 entries, 0 to 1405
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Issue Price          598 non-null    float64
 1   Lot Size             598 non-null    float64
 2   Issue Price (Rs Cr)  598 non-null    float64
 3   QIB                  598 non-null    float64
 4   NII                  598 non-null    float64
 5   RII                  598 non-null    float64
 6   TOTAL                598 non-null    float64
 7   % Change             598 non-null    float64
dtypes: float64(8)
memory usage: 42.0 KB


In [20]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each column of udf
udf.describe()

Unnamed: 0,Issue Price,Lot Size,Issue Price (Rs Cr),QIB,NII,RII,TOTAL,% Change
count,598.0,598.0,598.0,598.0,598.0,598.0,598.0,598.0
mean,311.367057,343.479933,992.548378,52.669749,88.13811,28.066455,43.046355,25.310886
std,311.830685,824.945591,2127.707364,410.349394,157.296117,72.441492,68.862965,44.393352
min,10.0,5.0,3.99,0.0,0.02,0.03,0.44,-68.92
25%,90.0,25.0,66.1575,2.34,1.95,1.745,2.2875,-2.7575
50%,190.0,54.5,400.0,10.705,12.49,5.59,11.485,12.465
75%,439.0,130.0,886.0725,53.73,101.2775,16.135,56.6175,42.295
max,2150.0,10000.0,21008.48,9999.0,958.07,592.73,450.03,270.4


In [22]:
# Look for rows of udf whose QIB value equals the bad-record marker
# NOTE(review): udf.info() reports QIB as float64, so a comparison with the
# string 'R' cannot match — confirm where/how bad records are meant to be detected
bad_record = 'R'
qib_is_bad = udf['QIB'] == bad_record
rows_to_remove = udf.loc[qib_is_bad].index
# Only displays the matching index labels; this cell does not drop any rows
print(rows_to_remove) # No bad data

Index([], dtype='int64')


In [23]:
# Standardize the dataset: every column of udf (including '% Change') is
# rescaled by StandardScaler to zero mean and unit variance
scaler = StandardScaler()
# Learn the per-column statistics first, then apply them in a separate step
scaler.fit(udf)
# NOTE: transform() returns a NumPy array, so udf stops being a DataFrame here
# and its column labels are no longer attached
udf = scaler.transform(udf)

In [32]:
# Check the type of udf after scaling (the scaler output replaces the DataFrame)
type(udf)

numpy.ndarray