# Typical manipulations on pandas

In [None]:
# Silence noisy library warnings (e.g. from sklearn and seaborn).
import warnings


def ignore_warn(*args, **kwargs):
    """Accept any arguments and do nothing; stands in for ``warnings.warn``."""
    return None


# Monkey-patch the module attribute so every later ``warnings.warn(...)``
# lookup resolves to the no-op above.
warnings.warn = ignore_warn

In [None]:
# Rename a single column in place; the mapping is {old label: new label}.
df.rename(mapper={'old_name_1': 'new_name_1'}, axis='columns', inplace=True)

In [None]:
# Reminder of group by: mean `quality` per `color`.
# Select the column *before* aggregating so only `quality` is averaged;
# averaging every column first is wasteful and raises TypeError when the
# frame has non-numeric columns (pandas >= 2.0).
df.groupby('color')['quality'].mean()

In [None]:
# Remove the listed columns from df in place.
df.drop(columns=['Stnd', 'Underhood ID'], inplace=True)

In [None]:
# Normalise column labels in place: trim surrounding whitespace, lowercase,
# and replace spaces with underscores.
def _clean_label(label):
    return label.strip().lower().replace(" ", "_")

df.rename(columns=_clean_label, inplace=True)

In [None]:
# Count the missing values in each column (isna is an alias of isnull).
df.isna().sum()

In [None]:
# Drop, in place, every row of df that contains at least one null value
# (repeat the call for each dataset that needs cleaning).
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# True if any column still holds a null value - should show False after cleaning.
# First any(): per column; second any(): across the columns.
df_08.isnull().any().any()

In [None]:
df['cyl'].value_counts()  # frequency of each distinct value in the column
df['cyl'].unique()  # array of the distinct values (the method is `unique`, not `uniques`)
df['cyl'].nunique()  # number of distinct values (the method is `nunique`, not `nuniques`)
df_08['greenhouse_gas_score'].astype(int)  # float to integer; returns a new Series

In [None]:
# Print how many rows of df are exact duplicates of an earlier row.
duplicate_count = df.duplicated().sum()
print(duplicate_count)

In [None]:
# Fixing data types
df.dtypes  # shows the data type of every column

# Extract the first run of digits with a regular expression,
# e.g. 6 out of "(6 cyl)", then convert it to integer.
# The pattern is a raw string: '\d' in a plain literal is an invalid escape
# sequence (DeprecationWarning, SyntaxWarning from Python 3.12).
df_08['cyl'].str.extract(r'(\d+)').astype(int)

In [None]:
# Useful queries
# Boolean expression over column names; `|` means "or".
# (The second operand previously read `alcohold`, which is not a column.)
df.query('alcohol < 10.3 | alcohol == 0.1')
df.query('gender == "bato"')  # string comparison: quote the value inside the expression
df[df['A'].isin([3, 6])]  # rows whose A is one of the listed values
df[~df['A'].isin([3, 6])]  # not in: `~` negates the boolean mask

In [None]:
# Persist the current frame for the next section; index=False leaves the
# row index out of the file.
df.to_csv(path_or_buf='name.csv', index=False)

## Missing data

In [None]:
# Imputing: fill each missing Age with the median Age of its
# (Title, SibSp, Parch, Pclass) group.
# transform() returns a Series aligned with df's own index, so the result can
# be assigned straight back (df["Age"] = ...); groupby.apply() may prepend the
# group keys to the index depending on the pandas version.
group_median_age = df.groupby(['Title', 'SibSp', 'Parch', 'Pclass'])["Age"].transform("median")
df["Age"].fillna(group_median_age)

In [None]:
# Fill missing entries with a particular value; returns a new Series.
df['bare_nuclei'].fillna(value=1)

In [None]:
from imblearn.over_sampling import SMOTE


def makeOverSamplesSMOTE(X, y):
    """Oversample a dataset with SMOTE using its default settings.

    Args:
        X: independent variables (features), e.g. a pandas DataFrame.
        y: dependent variable (target), in pandas format, matching X.

    Returns:
        Tuple ``(X_resampled, y_resampled)`` produced by SMOTE.
    """
    sm = SMOTE()
    # fit_resample() is the supported API; the older fit_sample() alias was
    # deprecated in imbalanced-learn 0.4 and removed in later releases.
    return sm.fit_resample(X, y)


X_aug, y_aug = makeOverSamplesSMOTE(X_train, y_train)

## Some typical feature engineering

In [None]:
# apply: divide values greater than 10 by 10, leave the rest unchanged
def _shrink_if_large(value):
    if value > 10:
        return value / 10
    return value

df['A'].apply(_shrink_if_large)

In [None]:
# Something more elaborate
def loan_amount(columns):
    """Derive a loan amount from five credit figures of a single record.

    Args:
        columns: ordered values (list, tuple, DataFrame row, ...) whose first
            five entries are, in order: total_credit_payments,
            worst_delinquency_estimated, worst_delinquency_past_due_estimated,
            amount_to_pay_next_payment, maximum_credit_amount.

    Returns:
        The first rule that applies:
        1. worst_delinquency_estimated != 0:
           total_credit_payments * (past_due_estimated / estimated)
        2. amount_to_pay_next_payment != 0 and total_credit_payments != 0:
           total_credit_payments * amount_to_pay_next_payment
        3. maximum_credit_amount != 0: maximum_credit_amount

    Raises:
        ValueError: if none of the three rules applies.
    """
    # Read the values by position from a plain list. Integer keys on a
    # label-indexed pandas row (columns[0]) rely on a deprecated positional
    # fallback, while iteration order is always positional.
    values = list(columns)
    total_credit_payments = values[0]
    worst_delinquency_estimated = values[1]
    worst_delinquency_past_due_estimated = values[2]
    amount_to_pay_next_payment = values[3]
    maximum_credit_amount = values[4]

    if worst_delinquency_estimated != 0.:
        return total_credit_payments * (
            worst_delinquency_past_due_estimated
            / worst_delinquency_estimated)

    # Scalars: use logical `and` rather than bitwise `&`.
    if amount_to_pay_next_payment != 0. and total_credit_payments != 0:
        return total_credit_payments * amount_to_pay_next_payment

    if maximum_credit_amount != 0:
        return maximum_credit_amount

    raise ValueError(
        "cannot derive a loan amount: delinquency estimate, next payment "
        "(or total payments) and maximum credit amount are all zero")
        


# Input columns, listed in the positional order loan_amount() reads them.
temp_list = [
    'total_credit_payments',
    'worst_delinquency_estimated',
    'worst_delinquency_past_due_estimated',
    'amount_to_pay_next_payment',
    'maximum_credit_amount',
]

# Row-wise apply: each row of the selected columns is passed to loan_amount
# and the result is stored in a new column.
df['loan_amount'] = df.loc[:, temp_list].apply(loan_amount, axis=1)