In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Common things I do.  Combined here for reference

#### 1. Load a CSV into a Dataframe

In [49]:
df = pd.read_csv("data/data.csv")
df.shape

(5144, 145)

#### 2. Create a label column out of a feature

In [50]:
df["Winner"] = df["Winner"].astype('category')
df["label"] = df["Winner"].cat.codes
df.shape

(5144, 146)

#### 3. Dummify a dataframe

In [51]:
dummy_df = pd.get_dummies(df)
dummy_df.shape

(5144, 4095)

#### 4. Dummify a single feature.  Remove original feature.

In [52]:
single_feature_df = pd.get_dummies(df["weight_class"])
returned_df = pd.concat([df, single_feature_df], axis=1)
print(returned_df.shape)
returned_df = returned_df.drop('weight_class', axis=1)
returned_df.shape

(5144, 160)


(5144, 159)

#### 5. Filter out a certain value from a feature

In [53]:
df_no_draws = df[df['Winner'] != 'Draw']
df_no_draws.shape

(5061, 146)

#### 6. Remove rows that contain a null in a certain feature

In [54]:
df_stance = df[df['B_Stance'].notna()]
df_stance.shape

(4985, 146)

#### 7. Split a dataframe based on a date

In [55]:
#If it isn't a date we need to make it a date

df['date'] = pd.to_datetime(df['date'])
event_df = df[df['date'] == '05-04-19']
master_df = df[df['date'] != '05-04-19']
print(event_df.shape)
print(master_df.shape)

(12, 146)
(5132, 146)


#### 8. Create a dataframe containing only certain features of a parent dataframe

In [56]:
sub_df = df[['Winner', 'weight_class']]
sub_df.shape

(5144, 2)

#### 9. Train / Test Split

In [58]:
df_no_label = df.drop('label', axis=1)
X = df.values
y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=75)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1543, 146)
(3601, 146)
(1543,)
(3601,)
