## Handling Structural Errors


Structural errors are those that arise during measurement, data transfer, or other types of "poor housekeeping.", such as: 
  - Data Type Conversion
  - Syntax Errors (Remove white spaces)
  - Fix Typos

In [1]:
import pandas as pd

In [2]:
# Import the dataset

df_titanic = pd.read_csv("titanic_train_1.csv")

In [3]:
df_titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,Third,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,$7.25,,S
1,2,1,First,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,$71.28,C85,C
2,3,1,Third,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,$7.93,,S
3,4,1,First,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,$53.10,C123,S
4,5,0,Third,"Allen, Mr. William Henry",male,35.0,0,0,373450,$8.05,,S


In [6]:
df_titanic.drop(["Name","Ticket"],inplace=True,axis=1)

In [7]:
df_titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Gender,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,Third,male,22.0,1,0,$7.25,,S
1,2,1,First,female,38.0,1,0,$71.28,C85,C
2,3,1,Third,female,26.0,0,0,$7.93,,S
3,4,1,First,female,35.0,1,0,$53.10,C123,S
4,5,0,Third,male,35.0,0,0,$8.05,,S


### Data Type Conversion

In [8]:
df_titanic.dtypes

PassengerId      int64
Survived         int64
Pclass          object
Gender          object
Age            float64
SibSp            int64
Parch            int64
Fare            object
Cabin           object
Embarked        object
dtype: object

In [4]:
## Lets use some calculation on fare column
## Lets calculate the mean value of Fare
df_titanic.Fare.mean()

TypeError: Could not convert $7.25 $71.28 $7.93 $53.10 $8.05 $8.46 $51.86 $21.08 $11.13 $30.07 $16.70 $26.55 $8.05 $31.28 $7.85 $16 $29.13 $13 $18 $7.23 $26 $13 $8.03 $35.50 $21.08 $31.39 $7.23 $263 $7.88 $7.90 $27.72 $146.52 $7.75 $10.50 $82.17 $52 $7.23 $8.05 $18 $11.24 $9.48 $21 $7.90 $41.58 $7.88 $8.05 $15.50 $7.75 $21.68 $17.80 $39.69 $7.80 $76.73 $26 $61.98 $35.50 $10.50 $7.23 $27.75 $46.90 $7.23 $80 $83.48 $27.90 $27.72 $15.25 $10.50 $8.16 $7.93 $8.66 $10.50 $46.90 $73.50 $14.45 $56.50 $7.65 $7.90 $8.05 $29 $12.48 $9 $9.50 $7.79 $47.10 $10.50 $15.85 $34.38 $8.05 $263 $8.05 $8.05 $7.85 $61.18 $20.58 $7.25 $8.05 $34.65 $63.36 $23 $26 $7.90 $7.90 $77.29 $8.65 $7.93 $7.90 $7.65 $7.78 $7.90 $24.15 $52 $14.45 $8.05 $9.83 $14.46 $7.93 $7.75 $21 $247.52 $31.28 $73.50 $8.05 $30.07 $13 $77.29 $11.24 $7.75 $7.14 $22.36 $6.98 $7.90 $7.05 $14.50 $26 $13 $15.05 $26.28 $53.10 $9.22 $79.20 $15.25 $7.75 $15.85 $6.75 $11.50 $36.75 $7.80 $34.38 $26 $13 $12.53 $66.60 $8.05 $14.50 $7.31 $61.38 $7.73 $8.05 $8.66 $69.55 $16.10 $15.75 $7.78 $8.66 $39.69 $20.53 $55 $27.90 $25.93 $56.50 $33.50 $29.13 $11.13 $7.93 $30.70 $7.85 $25.47 $28.71 $13 $0 $69.55 $15.05 $31.39 $39 $22.03 $50 $15.50 $26.55 $15.50 $7.90 $13 $13 $7.85 $26 $27.72 $146.52 $7.75 $8.40 $7.75 $13 $9.50 $69.55 $6.50 $7.23 $8.05 $10.46 $15.85 $18.79 $7.75 $31 $7.05 $21 $7.25 $13 $7.75 $113.28 $7.93 $27 $76.29 $10.50 $8.05 $13 $8.05 $7.90 $90 $9.35 $10.50 $7.25 $13 $25.47 $83.48 $7.78 $13.50 $31.39 $10.50 $7.55 $26 $26.25 $10.50 $12.28 $14.45 $15.50 $10.50 $7.13 $7.23 $90 $7.78 $14.50 $52.55 $26 $7.25 $10.46 $26.55 $16.10 $20.21 $15.25 $79.20 $86.50 $512.33 $26 $7.75 $31.39 $79.65 $0 $7.75 $10.50 $39.69 $7.78 $153.46 $135.63 $31 $0 $19.50 $29.70 $7.75 $77.96 $7.75 $0 $29.13 $20.25 $7.75 $7.85 $9.50 $8.05 $26 $8.66 $9.50 $7.90 $13 $7.75 $78.85 $91.08 $12.88 $8.85 $7.90 $27.72 $7.23 $151.55 $30.50 $247.52 $7.75 $23.25 $0 $12.35 $8.05 $151.55 $110.88 $108.90 $24 $56.93 $83.16 $262.38 $26 $7.90 $26.25 $7.85 $26 $14 $164.87 $134.50 $7.25 $7.90 $12.35 $29 $69.55 $135.63 $6.24 $13 $20.53 $57.98 $23.25 $28.50 $153.46 $18 $133.65 $7.90 $66.60 $134.50 $8.05 $35.50 $26 $263 $13 $13 $13 $13 $13 $16.10 $15.90 $8.66 $9.23 $35 $7.23 $17.80 $7.23 $9.50 $55 $13 $7.88 $7.88 $27.90 $27.72 $14.45 $7.05 $15.50 $7.25 $75.25 $7.23 $7.75 $69.30 $55.44 $6.50 $8.05 $135.63 $21.08 $82.17 $7.25 $211.50 $4.01 $7.78 $227.53 $15.74 $7.93 $52 $7.90 $73.50 $46.90 $13 $7.73 $12 $120 $7.80 $7.93 $113.28 $16.70 $7.80 $7.85 $26 $10.50 $12.65 $7.93 $8.05 $9.83 $15.85 $8.66 $21 $7.75 $18.75 $7.78 $25.47 $7.90 $6.86 $90 $0 $7.93 $8.05 $32.50 $13 $13 $24.15 $7.90 $7.73 $7.88 $14.40 $20.21 $7.25 $26 $26 $7.75 $8.05 $26.55 $16.10 $26 $7.13 $55.90 $120 $34.38 $18.75 $263 $10.50 $26.25 $9.50 $7.78 $13 $8.11 $81.86 $19.50 $26.55 $19.26 $30.50 $27.75 $19.97 $27.75 $89.10 $8.05 $7.90 $26.55 $51.86 $10.50 $7.75 $26.55 $8.05 $38.50 $13 $8.05 $7.05 $0 $26.55 $7.73 $19.26 $7.25 $8.66 $27.75 $13.79 $9.84 $52 $21 $7.05 $7.52 $12.29 $46.90 $0 $8.05 $9.59 $91.08 $25.47 $90 $29.70 $8.05 $15.90 $19.97 $7.25 $30.50 $49.50 $8.05 $14.46 $78.27 $15.10 $151.55 $7.80 $8.66 $7.75 $7.63 $9.59 $86.50 $108.90 $26 $26.55 $22.53 $56.50 $7.75 $8.05 $26.29 $59.40 $7.50 $34.02 $10.50 $24.15 $26 $7.90 $93.50 $7.90 $7.23 $57.98 $7.23 $7.75 $10.50 $221.78 $7.93 $11.50 $26 $7.23 $7.23 $22.36 $8.66 $26.25 $26.55 $106.43 $14.50 $49.50 $71 $31.28 $31.28 $26 $106.43 $26 $26 $13.86 $20.53 $36.75 $110.88 $26 $7.83 $7.23 $7.78 $26.55 $39.60 $227.53 $79.65 $17.40 $7.75 $7.90 $13.50 $8.05 $8.05 $24.15 $7.90 $21.08 $7.23 $7.85 $10.50 $51.48 $26.39 $7.75 $8.05 $14.50 $13 $55.90 $14.46 $7.93 $30 $110.88 $26 $40.13 $8.71 $79.65 $15 $79.20 $8.05 $8.05 $7.13 $78.27 $7.25 $7.75 $26 $24.15 $33 $0 $7.23 $56.93 $27 $7.90 $42.40 $8.05 $26.55 $15.55 $7.90 $30.50 $41.58 $153.46 $31.28 $7.05 $15.50 $7.75 $8.05 $65 $14.40 $16.10 $39 $10.50 $14.45 $52.55 $15.74 $7.85 $16.10 $32.32 $12.35 $77.96 $7.90 $7.73 $30 $7.05 $30.50 $0 $27.90 $13 $7.93 $26.25 $39.69 $16.10 $7.85 $69.30 $27.90 $56.50 $19.26 $76.73 $7.90 $35.50 $7.55 $7.55 $7.90 $23 $8.43 $7.83 $6.75 $73.50 $7.90 $15.50 $13 $113.28 $133.65 $7.23 $25.59 $7.50 $7.93 $73.50 $13 $7.78 $8.05 $52 $39 $52 $10.50 $13 $0 $7.78 $8.05 $9.84 $46.90 $512.33 $8.14 $76.73 $9.23 $46.90 $39 $41.58 $39.69 $10.17 $7.80 $211.34 $57 $13.42 $56.50 $7.23 $26.55 $13.50 $8.05 $7.73 $110.88 $7.65 $227.53 $26.29 $14.45 $7.74 $7.85 $26 $13.50 $26.29 $151.55 $15.25 $49.50 $26.55 $52 $9.48 $13 $7.65 $227.53 $10.50 $15.50 $7.78 $33 $7.05 $13 $13 $53.10 $8.66 $21 $7.74 $26 $7.93 $211.34 $18.79 $0 $13 $13 $16.10 $34.38 $512.33 $7.90 $7.90 $30 $78.85 $262.38 $16.10 $7.93 $71 $20.25 $13 $53.10 $7.75 $23 $12.48 $9.50 $7.90 $65 $14.50 $7.80 $11.50 $8.05 $86.50 $14.50 $7.13 $7.23 $120 $7.78 $77.96 $39.60 $7.75 $24.15 $8.36 $9.50 $7.85 $10.50 $7.23 $23 $7.75 $7.75 $12.48 $7.74 $211.34 $7.23 $57 $30 $23.45 $7.05 $7.25 $7.50 $29.13 $20.58 $79.20 $7.75 $26 $69.55 $30.70 $7.90 $13 $25.93 $8.68 $7.23 $24.15 $13 $26.25 $120 $8.52 $6.98 $7.78 $0 $7.78 $13 $53.10 $7.89 $24.15 $10.50 $31.28 $8.05 $0 $7.93 $37.00 $6.45 $27.90 $93.50 $8.66 $0 $12.48 $39.69 $6.95 $56.50 $37.00 $7.75 $80 $14.45 $18.75 $7.23 $7.85 $8.30 $83.16 $8.66 $8.05 $56.50 $29.70 $7.93 $10.50 $31 $6.44 $8.66 $7.55 $69.55 $7.90 $33 $89.10 $31.28 $7.78 $15.25 $39.40 $26 $9.35 $164.87 $26.55 $19.26 $7.23 $14.11 $11.50 $25.93 $69.55 $13 $13 $13.86 $50.50 $9.50 $11.13 $7.90 $52.55 $5 $9 $24 $7.23 $9.85 $7.90 $7.90 $83.16 $26 $7.90 $10.52 $10.50 $7.05 $29.13 $13 $30 $23.45 $30 $7.75  to numeric

We can see that Fare is showing the data type "object". We need to change the data type of this column.

In [7]:
## We will replace the dtype "object" to "float"
df_titanic["Fare"] = df_titanic.Fare.str.replace("$","").astype(float)

  df_titanic["Fare"] = df_titanic.Fare.str.replace("$","").astype(float)


In [8]:
df_titanic.dtypes

PassengerId      int64
Survived         int64
Pclass          object
Name            object
Gender          object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [12]:
df_titanic.Fare

0       7.25
1      71.28
2       7.93
3      53.10
4       8.05
       ...  
886    13.00
887    30.00
888    23.45
889    30.00
890     7.75
Name: Fare, Length: 891, dtype: float64

In [13]:
## Lets use some calculation on fare column
## Lets calculate the mean value of Fare
df_titanic.Fare.mean()

32.20501683501681

### Syntax Errors

#### Remove White Spaces
Extra white spaces at the beginning or the end of a string should be removed.

In [14]:
import pandas as pd
df = pd.DataFrame({
    'company_code': ['Abcd','EFGF ',' fhygy','abcd',' '],
    'date_of_sale ': ['12/05/2002','16/02/1999','25/09/1998','12/02/2022','15/09/1997'],
    'sale_amount': [12348.5, 233331.2, 22.5, 2566552.0, 23.0]})

In [15]:
df

Unnamed: 0,company_code,date_of_sale,sale_amount
0,Abcd,12/05/2002,12348.5
1,EFGF,16/02/1999,233331.2
2,fhygy,25/09/1998,22.5
3,abcd,12/02/2022,2566552.0
4,,15/09/1997,23.0


In [16]:
print("Original DataFrame:")
print(df)
print("\nIs space is present?")
# Check if the string ends with a white space
#df['company_code_is_title'] = list(map(lambda x: x.endswith(" "), df['company_code']))
# Check if the string start with a white space
#df['company_code_is_title'] = list(map(lambda x: x.startswith(" "), df['company_code']))
# strip : Remove spaces at the beginning and at the end of the string:
df['company_code_is_title'] = list(map(lambda x: x.strip(), df['company_code']))
print(df)

Original DataFrame:
  company_code date_of_sale   sale_amount
0         Abcd    12/05/2002      12348.5
1        EFGF     16/02/1999     233331.2
2        fhygy    25/09/1998         22.5
3         abcd    12/02/2022    2566552.0
4                 15/09/1997         23.0

Is space is present?
  company_code date_of_sale   sale_amount company_code_is_title
0         Abcd    12/05/2002      12348.5                  Abcd
1        EFGF     16/02/1999     233331.2                  EFGF
2        fhygy    25/09/1998         22.5                 fhygy
3         abcd    12/02/2022    2566552.0                  abcd
4                 15/09/1997         23.0                      


In [17]:
df

Unnamed: 0,company_code,date_of_sale,sale_amount,company_code_is_title
0,Abcd,12/05/2002,12348.5,Abcd
1,EFGF,16/02/1999,233331.2,EFGF
2,fhygy,25/09/1998,22.5,fhygy
3,abcd,12/02/2022,2566552.0,abcd
4,,15/09/1997,23.0,


#### Lets apply on Titanic Dataset

We can see in graph that there are 4 classes but ideally there should be only 2 clases.
It seems like there is typo error. "fem" should be "female" and "mal" should be "male"

In [None]:
# https://xspdf.com/resolution/58311140.html