In [1]:
import numpy
import pandas
import re
import os

In [2]:
# Season (year) to process; it is substituted into the input and output
# file paths built elsewhere in this notebook.
season = 2013

In [3]:
# Path of the raw composite rankings file for the chosen season; the
# parsing cell below opens it for reading.
input_file_name = 'data/rankings/{}_composite_rankings.csv'.format(season)

def pattern_match(pattern, string):
    """Tell whether the regular expression `pattern` occurs in `string`.

    The search is unanchored (``re.search``), so a match anywhere in
    `string` counts.

    Returns:
        bool: True if a match was found, False otherwise.
    """
    found = re.search(pattern, string)
    if found is None:
        return False
    return True

# Columns that are not individual rankers. Every other DataFrame column is
# treated as a ranker column when `ranker_list` is computed later on.
special_columns = ['Team', 'Rank', 'Conf', 'Record', 'Mean', 'Median', 'St.Dev']

In [4]:
def get_fields(width, line, data_type, n=1):
    """Peel `n` consecutive fixed-width fields off the front of `line`.

    Args:
        width: Number of characters occupied by each field.
        line: Text to consume fields from.
        data_type: Callable applied to the whitespace-stripped field text
            (for example ``int``, ``float`` or ``str``).
        n: How many fields of this width to read.

    Returns:
        tuple: ``(values, remainder)`` where ``values`` is a list with one
        entry per field (``numpy.nan`` when the field is blank, otherwise
        ``data_type(text)``) and ``remainder`` is what is left of `line`
        after the consumed characters.
    """
    remainder = line
    values = []
    for _ in range(n):
        # Split off the next field and keep the rest for the following pass.
        chunk, remainder = remainder[:width], remainder[width:]
        text = chunk.strip()
        if text == '':
            values.append(numpy.nan)
        else:
            values.append(data_type(text))

    return (values, remainder)

def parse_line(line):
    """Split one fixed-width rankings row into a flat list of values.

    The row is described as an ordered layout of
    ``(width, type, count, keep)`` entries. Each entry is read from the
    front of the remaining text with ``get_fields``. Separator columns are
    read but not kept. Blank fields come back as ``numpy.nan`` (see
    ``get_fields``).

    Args:
        line: One data row of the rankings file.

    Returns:
        list: The 74 kept values, in the order they appear in the row.
    """
    # Column widths, in characters, for each kind of field.
    w_ranker, w_gap, w_rank = 4, 2, 5
    w_team, w_conf, w_record = 17, 5, 7
    w_team_short = 9
    w_float, w_float_wide = 6, 7

    # Reusable layout entries: (width, type, count, keep).
    ranker_group = (w_ranker, int, 5, True)
    gap = (w_gap, str, 1, False)
    rank_field = (w_rank, int, 1, True)

    # First block: two ranker groups, then rank / team / conference / record.
    layout = [
        ranker_group, gap, ranker_group,
        rank_field,
        (w_team, str, 1, True),
        (w_conf, str, 1, True),
        (w_record, str, 1, True),
    ]

    # Blocks 2 through 4: three ranker groups followed by rank and short team.
    for _ in range(3):
        layout.extend([gap, ranker_group] * 3)
        layout.append(rank_field)
        layout.append((w_team_short, str, 1, True))

    # Block 5: one ranker group, a single ranker column, then three floats.
    layout.extend([
        gap, ranker_group,
        gap, (w_ranker, int, 1, True),
        gap, (w_float, float, 2, True),
        (w_float_wide, float, 1, True),
    ])

    values = []
    rest = line
    for width, kind, count, keep in layout:
        parsed, rest = get_fields(width, rest, kind, count)
        if keep:
            values.extend(parsed)

    return values

In [5]:
# Read the fixed-width rankings file into a dict of columns, build a
# DataFrame from it, and fill missing ranker values with the row's Median.
with open(input_file_name, 'r') as input_file:
    for line_number, line in enumerate(input_file):
        if line_number == 0:
            # The first line is the header. Build a real list rather than
            # using map(): on Python 3 map() returns a one-shot iterator,
            # which would be exhausted by the loop below and then break both
            # the duplicate-header comparison and zip(header, data).
            header = [s.strip().strip(',') for s in line.split()]
            # Unique column names in first-seen order (the header repeats
            # some names).
            df_header = list()
            for f in header:
                if f not in df_header:
                    df_header.append(f)
            df_dict = dict([(f, list()) for f in df_header])
            continue

        # skip empty lines
        if line.strip() == '':
            continue

        # Check for a duplicate header line
        duplicate_header = [s.strip().strip(',') for s in line.split()]
        if header == duplicate_header:
            continue

        data = parse_line(line)
        # For column names that occur more than once in the header, keep only
        # the value from the first occurrence in this row.
        recorded = set()
        for f, x in zip(header, data):
            if f not in recorded:
                df_dict[f].append(x)
                recorded.add(f)

df = pandas.DataFrame(df_dict)

# Every column that is not a special column is treated as a ranker.
ranker_list = sorted(list(set(df.columns) - set(special_columns)))
feature_list = list(special_columns) + ranker_list

# Replace missing ranker values with the Median value of the same row.
for ranker in ranker_list:
    df[ranker] = df[ranker].fillna(df['Median'])

# Preview the first five rows.
df[feature_list][:5]

Unnamed: 0,Team,Rank,Conf,Record,Mean,Median,St.Dev,7OT,ADE,AP,...,SPW,STH,TMR,TPR,TW,USA,WIL,WLK,WOB,WOL
0,Louisville,1,BE,29-5,1.89,2.0,1.25,1,1,2.0,...,4,1,1,2,1,2.0,1,1,2,2
1,Gonzaga,2,WCC,31-2,3.48,3.0,2.31,5,8,1.0,...,1,5,2,3,5,1.0,2,5,1,1
2,Indiana,3,B10,28-6,3.92,4.0,1.96,4,2,4.0,...,8,2,3,4,7,4.0,3,2,3,6
3,Kansas,4,B12,29-5,4.79,5.0,1.69,6,3,3.0,...,6,4,4,5,3,3.0,3,6,6,4
4,Duke,5,ACC,27-5,4.8,5.0,2.19,2,5,6.0,...,2,3,7,6,4,7.0,6,4,5,5


In [6]:
# Write the special columns followed by the sorted ranker columns to the
# cleaned file for this season.
# NOTE: the file has a .csv extension but is pipe-delimited (sep='|').
output_file = 'data/rankings/{}_composite_rankings.clean.csv'.format(season)
df[feature_list].to_csv(output_file, sep='|')