# Linear Regression

## Imports

In [35]:
import ast

import pandas as pd
import seaborn as sns
import sklearn

sns.set_style('darkgrid')

In [61]:
# Load the MoonBoard CSV (its first column becomes the row index) and discard
# every row that contains a missing value, in a single chained expression.
df = pd.read_csv('../webscraping/moonboard_data.csv', index_col=0).dropna()
df.head()

Unnamed: 0,holds,grades,year,angle
0,"[('A18', 'end'), ('C10', 'hold'), ('D15', 'hol...",7B,2016,40
1,"[('E18', 'end'), ('F13', 'hold'), ('G6', 'star...",7B,2016,40
2,"[('E18', 'end'), ('F14', 'hold'), ('F9', 'hold...",7B,2016,40
3,"[('B3', 'start'), ('E10', 'hold'), ('F7', 'hol...",7B,2016,40
4,"[('D10', 'hold'), ('H18', 'end'), ('H14', 'hol...",7B,2016,40


In [37]:
# Parse each row's hold string exactly once. `ast.literal_eval` replaces `eval`
# because the text comes from a CSV file: it only accepts Python literals, so a
# malformed or malicious cell cannot execute code. Values that are no longer
# strings (i.e. already parsed) pass through unchanged, so re-running this cell
# does not fail.
parsed_holds = df['holds'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)
df['n_holds'] = parsed_holds.apply(len)  # number of holds in each row
df['holds'] = parsed_holds

## Encoding holds as dictionary
Here we create a custom transformer which:
- drops null values from the `DataFrame`
- parses the string of holds into a list of holds
- encodes each row's holds as a `{hold: 1}` dictionary that we can pass to `DictVectorizer`

In [55]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin 
from typing import List

class DictEncoder(BaseEstimator, TransformerMixin):
    """
    Encodes each row's holds into a dictionary of key, value pairs where
    each key is a hold of the form [column index A-K][row index 1-18]
    and each value is 1.

    `fit` learns nothing, and `transform` returns a new Series without
    modifying the frame it is given.
    """

    def fit(self, X: pd.DataFrame, y=None):
        """ Nothing is learned from the data; returns the encoder itself. """
        return self

    def parseList(self, holdlist: List[tuple]):
        """ Parses a list of (hold, color) pairs into a dictionary of the form {hold : 1} """
        return {hold: 1 for hold, _color in holdlist}

    def _toHoldList(self, raw):
        """ Returns `raw` as a list of holds, parsing it first if it is still a string. """
        if isinstance(raw, str):
            # literal_eval instead of eval: the text originates from a CSV file,
            # and literal_eval only accepts Python literals, so it cannot run code.
            return ast.literal_eval(raw)
        # Not a string, so it is taken to be parsed already (an earlier cell may
        # have converted the column to lists); eval would raise TypeError on it.
        return raw

    def transform(self, X: pd.DataFrame):
        """
        Encodes the `holds` column of `X`.

        Rows containing missing values are dropped first. Each remaining entry
        of `holds` may be either the string form of a list of (hold, color)
        pairs or the list itself.

        Returns a Series of {hold : 1} dictionaries, indexed like the rows of
        `X` that were kept.
        """
        X = X.dropna()  # first drop missing values (problem no longer exists)
        # Build the result directly instead of assigning it back onto the
        # dropna() result; the returned Series is the same.
        return X['holds'].apply(lambda raw: self.parseList(self._toHoldList(raw)))

In [57]:
de = DictEncoder()
# Show the first encoded row by position. A label lookup with [0] would raise
# KeyError if the row labelled 0 had been dropped for containing missing values.
de.transform(df).iloc[0]

{'A18': 1, 'C10': 1, 'D15': 1, 'G4': 1, 'H8': 1}