# Preprocessing Datasets

In [1]:
import pandas as pd

## Preprocessing skuinfo

In [2]:
# Load every raw line as one string in a single column (column 0): the
# separator is a sentinel string, presumably chosen because it never occurs in
# the data, so nothing is split at read time. Lines with stray commas are
# filtered out before the real split further down.
# A multi-character separator is only supported by the python parser engine;
# naming the engine explicitly avoids the ParserWarning pandas emits when it
# has to fall back from the C engine.
df = pd.read_csv("skuinfo.csv", sep = "delimieter", header = None, engine = "python")

  """Entry point for launching an IPython kernel.


In [3]:
# Preview the first rows: each raw line is still one unsplit string in column 0
df.head()

Unnamed: 0,0
0,"3,6505,113,000400000003000,00 F55KT2,WHISPE..."
1,"4,8101,002,000400000004000,22 615CZ4,SPEARM..."
2,"5,7307,003,000400000005000,7LBS 245-01,34 SIL..."
3,"8,3404,00B,000400000008000,622 F05H84,MORNIN..."
4,"15,2301,004,000400000015000,126 MDU461,255CA..."


In [4]:
# Find lines with more commas than the standard (10); such lines contain extra
# separators and would not split cleanly into the expected 11 fields.
# The count is vectorized over the whole column instead of a per-row Python
# loop; the matching index labels are kept as a plain list for the drop below.
too_many_commas = df[0].str.count(",") > 10
drops = df.index[too_many_commas].tolist()

len(drops)

8148

In [5]:
# Remove the malformed rows collected in `drops` (matched by index label)
df = df.drop(index = drops)

In [6]:
# Sanity check: after the drop above, no remaining line should have more
# commas than the standard (10). Nothing is dropped here; the expected
# result is an empty list.
# Index labels are collected (not enumerate positions), because the index is
# no longer contiguous once rows have been removed.
drops2 = df.index[df[0].str.count(",") > 10].tolist()
len(drops2)

0

In [8]:
# Expand each raw line into one column per comma-separated field
df = df[0].str.split(pat = ",", expand = True)

In [9]:
# Preview the split result: the fields now sit in columns labelled 0 through 10
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,3,6505,113,400000003000,00 F55KT2,WHISPERWHITE,P8EA,1,5119207,TURNBURY,0
1,4,8101,002,400000004000,22 615CZ4,SPEARMI,S,1,3311144,C A SPOR,0
2,5,7307,003,400000005000,7LBS 245-01,34 SILVER,KING,1,5510554,BEAU IDE,0
3,8,3404,00B,400000008000,622 F05H84,MORNING MI,2T,1,2912827,HARTSTRI,0
4,15,2301,004,400000015000,126 MDU461,255CAMEL,12,1,23272,JONES/LA,0


In [10]:
# Cast the integer-valued columns (labels 0, 1 and 7) in a single call
df = df.astype({0: int, 1: int, 7: int})

def strp(x):
    """Return ``x`` with leading and trailing whitespace removed.

    Values that have no ``strip`` method (for example ``None`` or NaN, which
    ``str.split(expand=True)`` produces for rows with fewer fields) are
    returned unchanged instead of raising ``AttributeError``.
    """
    if hasattr(x, "strip"):
        return x.strip()
    return x

# Remove the whitespaces from every non-numeric column.
# Columns are addressed by their label rather than by enumerate position, so
# this stays correct if the labels are not 0..n-1. The vectorized
# .str.strip() leaves missing values untouched instead of raising on them.
for col in df.select_dtypes(exclude = "number").columns:
    df[col] = df[col].str.strip()

In [12]:
# Give columns 0-9 their schema names, in positional order
# (column 10 is not renamed and keeps its numeric label)
df = df.rename(columns = dict(enumerate(
    ["SKU", "DEPT", "CLASSID", "UPC", "STYLE", "COLOR", "SIZE", "PACKSIZE", "VENDOR", "BRAND"]
)))

In [None]:
# Transform data type of columns: store the identifier and label columns as
# categoricals.
# The skuinfo table is held in `df` throughout this section; no variable named
# `skuinfo` is defined in this notebook, so the cast must target `df`.
df = df.astype({
    name: "category"
    for name in ["DEPT", "SKU", "CLASSID", "UPC", "STYLE", "COLOR", "SIZE", "BRAND", "VENDOR"]
})

In [14]:
# The trailing column (label 10) has no counterpart in the database schema,
# so it is removed
df = df.drop(columns = [10])

In [16]:
# Save cleaned skuinfo data to a new csv file
# (index = False: the row index is not written out as an extra column)
df.to_csv("skuinfo2.csv", index = False)

## Preprocessing Deptinfo

In [4]:
# Load every raw line as one string in a single column (column 0); the
# sentinel separator is presumably chosen because it never occurs in the data.
# Multi-character separators require the python parser engine, so it is named
# explicitly to avoid the ParserWarning raised on fallback from the C engine.
deptinfo = pd.read_csv("deptinfo.csv", sep = "delimieter", header = None, engine = "python")

  """Entry point for launching an IPython kernel.


In [5]:
# Break the single raw column into one column per comma-separated field,
# then preview the first rows
deptinfo = deptinfo[0].str.split(pat = ",", expand = True)
deptinfo.head(5)

Unnamed: 0,0,1,2
0,800,CLINIQUE,0
1,801,LESLIE,0
2,1100,GARY F,0
3,1107,JACQUES,0
4,1202,CABERN,0


In [6]:
# Name the three positional columns, in order
deptinfo = deptinfo.rename(columns = dict(enumerate(["DEPT", "DEPTDESC", "DEPT2"])))

In [8]:
# DEPT2 (the last column) has no counterpart in the database schema, so it is removed
deptinfo = deptinfo.drop(columns = ["DEPT2"])

In [None]:
# Store both deptinfo columns as categoricals
deptinfo = deptinfo.astype({"DEPT": "category", "DEPTDESC": "category"})

In [9]:
# Preview the cleaned table: only DEPT and DEPTDESC remain
deptinfo.head()

Unnamed: 0,DEPT,DEPTDESC
0,800,CLINIQUE
1,801,LESLIE
2,1100,GARY F
3,1107,JACQUES
4,1202,CABERN


In [10]:
# Save cleaned deptinfo data to a new csv file
# (index = False: the row index is not written out as an extra column)
deptinfo.to_csv("deptinfo2.csv", index = False)

## Preprocessing TRNSACT

In [55]:
# Read the comma-separated transactions file (it has no header row), loading
# every column as a categorical, then preview the first rows
trnsact = pd.read_csv("trnsact.csv", header = None, dtype = "category")
trnsact.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,3,202,290,1100,326708721,2005-01-18,P,1,0.0,30.0,30.0,3500000,818,0
1,3,202,540,2700,326708721,2005-01-29,R,1,0.0,30.0,30.0,15200000,818,0
2,3,303,500,2100,23702074,2004-08-18,P,1,0.0,12.0,12.0,4600000,48,0
3,3,709,360,500,0,2005-08-14,P,1,0.0,30.0,30.0,6500000,818,0
4,3,802,660,400,0,2005-08-09,P,1,440.0,30.0,30.0,4700000,599,0


In [75]:
# Name the fourteen positional columns, listed in column order
trnsact = trnsact.rename(columns = dict(enumerate([
    "SKU", "STORE", "REGISTER", "TRANNUM", "SEQ", "SALEDATE", "STYPE",
    "QUANTITY", "ORGPRICE", "AMT", "AMT2", "INTERID", "MIC", "Unknown",
])))

In [78]:
# The Unknown column has no counterpart in the dataset schema, so it is removed
trnsact = trnsact.drop(columns = ["Unknown"])

In [81]:
# QUANTITY was loaded as a categorical; convert it to an integer column
trnsact = trnsact.astype({"QUANTITY": "int"})

In [82]:
# The three monetary columns were loaded as categoricals; convert them to floats
trnsact = trnsact.astype({"AMT": "float", "AMT2": "float", "ORGPRICE": "float"})

In [87]:
# Parse SALEDATE into datetimes.
# The previews show ISO dates (e.g. 2005-01-18), so the format is given
# explicitly: `infer_datetime_format` is deprecated in pandas 2.x, and a fixed
# format makes any malformed date fail loudly instead of being guessed at.
trnsact['SALEDATE'] = pd.to_datetime(trnsact['SALEDATE'], format = "%Y-%m-%d")

In [88]:
# Preview the cleaned table with its schema column names
trnsact.head(5)

Unnamed: 0,SKU,STORE,REGISTER,TRANNUM,SEQ,SALEDATE,STYPE,QUANTITY,ORGPRICE,AMT,AMT2,INTERID,MIC
0,3,202,290,1100,326708721,2005-01-18,P,1,0.0,30.0,30.0,3500000,818
1,3,202,540,2700,326708721,2005-01-29,R,1,0.0,30.0,30.0,15200000,818
2,3,303,500,2100,23702074,2004-08-18,P,1,0.0,12.0,12.0,4600000,48
3,3,709,360,500,0,2005-08-14,P,1,0.0,30.0,30.0,6500000,818
4,3,802,660,400,0,2005-08-09,P,1,440.0,30.0,30.0,4700000,599


In [None]:
# Save cleaned trnsact data to a new csv file
# (index = False: the row index is not written out as an extra column)
trnsact.to_csv("trnsact2.csv", index = False)

## Preprocessing SKSTINFO

In [13]:
# Load every raw line as one string in a single column (column 0); the
# sentinel separator is presumably chosen because it never occurs in the data.
# Multi-character separators require the python parser engine, so it is named
# explicitly to avoid the ParserWarning raised on fallback from the C engine.
skstinfo = pd.read_csv("skstinfo.csv", sep = "delimieter", header = None, engine = "python")
skstinfo.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,0
0,"3,102,123.36,440.00,0"
1,"3,103,123.36,440.00,0"
2,"3,104,123.36,440.00,0"
3,"3,202,123.36,440.00,0"
4,"3,203,123.36,440.00,0"


In [14]:
# Break the single raw column into one column per comma-separated field,
# then preview the first rows
skstinfo = skstinfo[0].str.split(pat = ",", expand = True)
skstinfo.head(5)

Unnamed: 0,0,1,2,3,4
0,3,102,123.36,440.0,0
1,3,103,123.36,440.0,0
2,3,104,123.36,440.0,0
3,3,202,123.36,440.0,0
4,3,203,123.36,440.0,0


In [15]:
# Name the five positional columns, in order
skstinfo = skstinfo.rename(columns = dict(enumerate(["SKU", "STORE", "COST", "RETAIL", "Unknown"])))

In [16]:
# The Unknown column has no counterpart in the database schema, so it is removed
skstinfo = skstinfo.drop(columns = ["Unknown"])

In [18]:
# Prices become floats; the store and SKU identifiers become categoricals
skstinfo = skstinfo.astype({
    "COST": float,
    "RETAIL": float,
    "STORE": "category",
    "SKU": "category",
})

In [19]:
# Save cleaned skstinfo data to a new csv file
# (index = False: the row index is not written out as an extra column)
skstinfo.to_csv("skstinfo2.csv", index = False)

## Preprocessing STRINFO

In [29]:
# Load every raw line as one string in a single column (column 0); the
# sentinel separator is presumably chosen because it never occurs in the data.
# Multi-character separators require the python parser engine, so it is named
# explicitly to avoid the ParserWarning raised on fallback from the C engine.
strinfo = pd.read_csv("strinfo.csv", sep = "delimieter", header = None, engine = "python")
strinfo.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,0
0,"2,ST. PETERSBURG ,FL,33710,0"
1,"3,ST. LOUIS ,MO,63126,0"
2,"4,LITTLE ROCK ,AR,72201,0"
3,"7,FORT WORTH ,TX,76137,0"
4,"9,TEMPE ,AZ,85281,0"


In [30]:
# Split comma-separated values into columns
strinfo = strinfo[0].str.split(",", expand = True)
# The raw lines carry whitespace before the comma that follows the city name
# (e.g. "ST. PETERSBURG ,FL"), so strip every field, as is done for the
# skuinfo table, to keep that whitespace out of the saved file.
strinfo = strinfo.apply(lambda column: column.str.strip())
strinfo.head()

Unnamed: 0,0,1,2,3,4
0,2,ST. PETERSBURG,FL,33710,0
1,3,ST. LOUIS,MO,63126,0
2,4,LITTLE ROCK,AR,72201,0
3,7,FORT WORTH,TX,76137,0
4,9,TEMPE,AZ,85281,0


In [31]:
# Name the five positional columns, in order
strinfo = strinfo.rename(columns = dict(enumerate(["STORE", "CITY", "STATE", "ZIP", "unknown"])))

In [32]:
# Store the four schema columns as categoricals
strinfo = strinfo.astype({
    "STORE": "category",
    "ZIP": "category",
    "CITY": "category",
    "STATE": "category",
})

In [34]:
# The "unknown" column has no counterpart in the database schema, so it is removed
strinfo = strinfo.drop(columns = ["unknown"])

In [35]:
# Preview the cleaned table: STORE, CITY, STATE and ZIP remain
strinfo.head()

Unnamed: 0,STORE,CITY,STATE,ZIP
0,2,ST. PETERSBURG,FL,33710
1,3,ST. LOUIS,MO,63126
2,4,LITTLE ROCK,AR,72201
3,7,FORT WORTH,TX,76137
4,9,TEMPE,AZ,85281


In [36]:
# Save cleaned strinfo data to a new csv file
# (index = False: the row index is not written out as an extra column)
strinfo.to_csv("strinfo2.csv", index = False)