**Get the Descriptive Statistics for Pandas DataFrame**

In [2]:
from pandas import DataFrame

# Sample car listings: one text column (Brand) and two numeric columns.
Cars = {
    'Brand': ['Honda Civic', 'Ford Focus', 'Toyota Corolla', 'Toyota Corolla', 'Audi A4'],
    'Price': [22000, 27000, 25000, 29000, 35000],
    'Year': [2014, 2015, 2016, 2017, 2018],
}

# The column list is taken from the dict keys, so the frame keeps the
# Brand / Price / Year order.
df = DataFrame(data=Cars, columns=list(Cars))
print(df)

            Brand  Price  Year
0     Honda Civic  22000  2014
1      Ford Focus  27000  2015
2  Toyota Corolla  25000  2016
3  Toyota Corolla  29000  2017
4         Audi A4  35000  2018


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the Price column.
df.loc[:, 'Price'].describe()


count        5.000000
mean     27600.000000
std       4878.524367
min      22000.000000
25%      25000.000000
50%      27000.000000
75%      29000.000000
max      35000.000000
Name: Price, dtype: float64

In [4]:
# Summarize the Price column, then cast every statistic to an integer.
# NOTE: the cast drops the fractional part (the std of 4878.52... prints as 4878).
price_summary = df['Price'].describe()
stats_numeric = price_summary.astype(int)
print(stats_numeric)

count        5
mean     27600
std       4878
min      22000
25%      25000
50%      27000
75%      29000
max      35000
Name: Price, dtype: int32


In [5]:
# For the text column, describe() reports count, unique, top and freq.
brand_column = df['Brand']
stats_categorical = brand_column.describe()
print(stats_categorical)

count                  5
unique                 4
top       Toyota Corolla
freq                   2
Name: Brand, dtype: object


In [6]:
# include='all' summarizes the text and numeric columns in one table;
# statistics that do not apply to a column are shown as NaN.
describe_options = {'include': 'all'}
stats = df.describe(**describe_options)
print(stats)

                 Brand         Price         Year
count                5      5.000000     5.000000
unique               4           NaN          NaN
top     Toyota Corolla           NaN          NaN
freq                 2           NaN          NaN
mean               NaN  27600.000000  2016.000000
std                NaN   4878.524367     1.581139
min                NaN  22000.000000  2014.000000
25%                NaN  25000.000000  2015.000000
50%                NaN  27000.000000  2016.000000
75%                NaN  29000.000000  2017.000000
max                NaN  35000.000000  2018.000000


In [7]:
# Compute each descriptive statistic of the Price column individually.
price_col = df['Price']

count1 = price_col.count()
mean1 = price_col.mean()
std1 = price_col.std()
min1 = price_col.min()
quantile1 = price_col.quantile(q=0.25)
quantile2 = price_col.quantile(q=0.50)
quantile3 = price_col.quantile(q=0.75)
max1 = price_col.max()

# Print the values in the same order that describe() lists them.
for label, value in [
    ('count: ', count1),
    ('mean: ', mean1),
    ('std: ', std1),
    ('min: ', min1),
    ('25%: ', quantile1),
    ('50%: ', quantile2),
    ('75%: ', quantile3),
    ('max: ', max1),
]:
    print(label + str(value))
    # A blank line follows every entry except the last, as in the recorded output.
    if label != 'max: ':
        print()

count: 5
mean: 27600.0
std: 4878.524367060188
min: 22000
25%: 25000.0
50%: 27000.0
75%: 29000.0
max: 35000


**Convert Strings to Floats in Pandas DataFrame**

In [8]:
import pandas as pd

# Prices are given as text, so the Price column is not numeric yet.
Data = {
    'Product': ['ABC', 'XYZ'],
    'Price': ['250', '270'],
}

df = pd.DataFrame(data=Data)

# Show the frame first, then the dtype of each column.
for view in (df, df.dtypes):
    print(view)

  Product Price
0     ABC   250
1     XYZ   270
Product    object
Price      object
dtype: object


In [9]:

# Cast the Price column from text to float, keeping the column order.
price_as_float = df['Price'].astype(float)
df = df.assign(Price=price_as_float)

# Show the converted frame and the resulting dtypes.
for view in (df, df.dtypes):
    print(view)

  Product  Price
0     ABC  250.0
1     XYZ  270.0
Product     object
Price      float64
dtype: object


In [10]:
import pandas as pd

# Two of the prices ('ABC260', '280XYZ') contain letters.
Data = {
    'Product': ['AAA', 'BBB', 'CCC', 'DDD'],
    'Price': ['250', 'ABC260', '270', '280XYZ'],
}

df = pd.DataFrame(Data)

# errors='coerce' yields NaN for entries that cannot be parsed as numbers.
numeric_price = pd.to_numeric(df['Price'], errors='coerce')
df['Price'] = numeric_price

print(df)
print(df.dtypes)

  Product  Price
0     AAA  250.0
1     BBB    NaN
2     CCC  270.0
3     DDD    NaN
Product     object
Price      float64
dtype: object


In [11]:
import pandas as pd
import numpy as np

# Two of the prices ('ABC260', '280XYZ') contain letters and cannot be parsed.
Data = {'Product': ['AAA', 'BBB', 'CCC', 'DDD'],
        'Price': ['250', 'ABC260', '270', '280XYZ']}
df = pd.DataFrame(Data)

# errors='coerce' turns the unparseable prices into NaN; fillna(0) then
# replaces those NaN values with 0. This targets the Price column only and
# avoids df.replace(np.nan, 0, regex=True), where regex=True has no meaning
# because the value being replaced is not a string pattern.
df['Price'] = pd.to_numeric(df['Price'], errors='coerce').fillna(0)

print(df)
print(df.dtypes)

  Product  Price
0     AAA  250.0
1     BBB    0.0
2     CCC  270.0
3     DDD    0.0
Product     object
Price      float64
dtype: object


In [12]:
from pandas import DataFrame

# Keep the first five characters of every identifier.
Data = {'Identifier': ['55555-abc', '77777-xyz', '99999-mmm']}
df = DataFrame(data=Data, columns=['Identifier'])

identifier = df['Identifier']
Left = identifier.str.slice(stop=5)

print(Left)

0    55555
1    77777
2    99999
Name: Identifier, dtype: object


In [14]:
# Plain Python slicing: everything from index 5 to the end of the string.
a = '123456789'
a[slice(5, None)]

'6789'

In [15]:
from pandas import DataFrame

# Keep the last five characters of every identifier.
Data = {'Identifier': ['ID-55555', 'ID-77777', 'ID-99999']}
df = DataFrame(data=Data, columns=['Identifier'])

identifier = df['Identifier']
Right = identifier.str.slice(start=-5)

print(Right)

0    55555
1    77777
2    99999
Name: Identifier, dtype: object


In [16]:
from pandas import DataFrame

# Take characters 3 through 7 (stop index 8 is exclusive) of every identifier.
Data = {'Identifier': ['ID-55555-End', 'ID-77777-End', 'ID-99999-End']}
df = DataFrame(data=Data, columns=['Identifier'])

identifier = df['Identifier']
Mid = identifier.str.slice(3, 8)

print(Mid)

0    55555
1    77777
2    99999
Name: Identifier, dtype: object


In [31]:
from pandas import DataFrame

# The part before the dash has a different length in every row, so split
# on '-' and keep the first piece.
Data = {'Identifier': ['111-IDAA', '2222222-IDB', '33-IDCCC']}
df = DataFrame(data=Data, columns=['Identifier'])

dash_parts = df['Identifier'].str.split('-')
BeforeSymbol = dash_parts.str.get(0)

print(BeforeSymbol)

0        111
1    2222222
2         33
Name: Identifier, dtype: object


In [32]:
from pandas import DataFrame

# Split on a single space and keep the piece in front of it.
Data = {'Identifier': ['111 IDAA', '2222222 IDB', '33 IDCCC']}
df = DataFrame(data=Data, columns=['Identifier'])

space_parts = df['Identifier'].str.split(' ')
BeforeSpace = space_parts.str.get(0)

print(BeforeSpace)

0        111
1    2222222
2         33
Name: Identifier, dtype: object


In [33]:
from pandas import DataFrame

# Split on '-' and keep the piece that follows the dash.
Data = {'Identifier': ['IDAA-111', 'IDB-2222222', 'IDCCC-33']}
df = DataFrame(data=Data, columns=['Identifier'])

dash_parts = df['Identifier'].str.split('-')
AfterSymbol = dash_parts.str.get(1)

print(AfterSymbol)

0        111
1    2222222
2         33
Name: Identifier, dtype: object


In [34]:
from pandas import DataFrame

# With two dashes per identifier, the middle piece of the split is the
# text between them.
Data = {'Identifier': ['IDAA-111-AA', 'IDB-2222222-B', 'IDCCC-33-CCC']}
df = DataFrame(data=Data, columns=['Identifier'])

dash_parts = df['Identifier'].str.split('-')
BetweenTwoSymbols = dash_parts.str.get(1)

print(BetweenTwoSymbols)

0        111
1    2222222
2         33
Name: Identifier, dtype: object


In [35]:
from pandas import DataFrame

# The wanted text sits between a '-' and a '$': take what follows the dash,
# then keep what precedes the dollar sign.
Data = {'Identifier': ['IDAA-111$AA', 'IDB-2222222$B', 'IDCCC-33$CCC']}
df = DataFrame(data=Data, columns=['Identifier'])

betweenTwoDifferentSymbols = (
    df['Identifier']
    .str.split('-').str.get(1)
    .str.split('$').str.get(0)
)

print(betweenTwoDifferentSymbols)

0        111
1    2222222
2         33
Name: Identifier, dtype: object


**NaN value replacement**

In [36]:
import pandas as pd

# A single text column in which two entries mix digits with letters.
raw_values = ['700', 'ABC300', '500', '900XYZ']
df = pd.DataFrame(data={'values': raw_values})
print(df)

   values
0     700
1  ABC300
2     500
3  900XYZ
