In [1]:
import numpy
import pandas
import re
import os

In [2]:
# Path of the raw rankings file read by this notebook. Despite the .csv
# extension, data rows are split by fixed column widths (see parse_line);
# only the first line is tokenised on whitespace to obtain column names.
input_file_name = 'data/rankings/2017_composite_rankings.csv'

def pattern_match(pattern, string):
    """Return True if ``pattern`` is found anywhere in ``string``, else False.

    The lookup uses ``re.search``, so the match may begin at any position
    in ``string`` rather than only at its start.
    """
    found = re.search(pattern, string)
    if found is None:
        return False
    return True

# Columns that are not individual ranker columns. Every other DataFrame
# column is treated as a ranker when ranker_list is built, and these names
# are placed first in feature_list.
special_columns = [
    'Team', 'Rank', 'Conf', 'Record',
    'Mean', 'Median', 'St.Dev',
]

In [3]:
def get_fields(width, line, data_type, n=1):
    """Split ``n`` fixed-width fields off the front of ``line``.

    Each field is ``width`` characters wide. A field is stripped of
    surrounding whitespace and converted with ``data_type``; a field that is
    empty after stripping becomes ``numpy.nan`` instead of being converted.

    Returns a ``(values, remainder)`` tuple: the list of ``n`` parsed values
    and whatever is left of ``line`` after the consumed fields.
    """
    values = []
    remainder = line
    for _ in range(n):
        # Peel one field off the front and keep the tail for the next pass.
        raw, remainder = remainder[:width], remainder[width:]
        text = raw.strip()
        if text == '':
            values.append(numpy.nan)
        else:
            values.append(data_type(text))

    return (values, remainder)

def parse_line(line):
    """Parse one fixed-width data row into a flat list of values.

    The row is consumed left to right with ``get_fields`` according to a
    fixed layout of five blocks. Two-character separator fields between
    groups are read and discarded; every other field is kept, in order.
    Blank fields come back as ``numpy.nan`` (see ``get_fields``).

    Returns the list of kept values. Any text remaining after the last
    field is ignored.
    """
    # Field widths, in characters.
    ranker_w = 4
    gap_w = 2
    rank_w = 5
    team_w = 17
    conf_w = 5
    record_w = 7
    short_team_w = 9
    float_w = 6
    wide_float_w = 7

    # Layout entries are (width, converter, count, keep).
    gap = (gap_w, str, 1, False)
    five_rankers = (ranker_w, int, 5, True)
    four_rankers = (ranker_w, int, 4, True)
    rank = (rank_w, int, 1, True)

    # First block: two groups of rankers, then rank/team/conference/record.
    layout = [
        five_rankers,
        gap,
        five_rankers,
        rank,
        (team_w, str, 1, True),
        (conf_w, str, 1, True),
        (record_w, str, 1, True),
    ]

    # Blocks 2 through 4: three separator+ranker groups, then a rank and a
    # short team field.
    for _ in range(3):
        layout.extend([gap, five_rankers] * 3)
        layout.append(rank)
        layout.append((short_team_w, str, 1, True))

    # Block 5: two full ranker groups, one group of four, then three floats
    # (two narrow, one wide).
    layout.extend([gap, five_rankers] * 2)
    layout.extend([
        gap,
        four_rankers,
        gap,
        (float_w, float, 2, True),
        (wide_float_w, float, 1, True),
    ])

    values = []
    rest = line
    for width, converter, count, keep in layout:
        parsed, rest = get_fields(width, rest, converter, count)
        if keep:
            values.extend(parsed)

    return values

In [7]:
# Read the rankings file into one list of values per column name.
#
# The header tokens are kept as real lists rather than `map` objects: the
# header is iterated once per data row and compared against later lines.
# Where `map` returns a one-shot iterator (Python 3) the original code
# exhausted it while building df_header, so duplicate header lines were never
# detected and no row values were ever recorded.
header = None
df_header = list()
df_dict = dict()

with open(input_file_name, 'r') as input_file:
    for line_number, line in enumerate(input_file):
        tokens = [s.strip().strip(',') for s in line.split()]

        if line_number == 0:
            # First line names the columns. Names repeat across the row, so
            # keep only the first occurrence of each, in order.
            header = tokens
            for f in header:
                if f not in df_header:
                    df_header.append(f)
            df_dict = dict([(f, list()) for f in df_header])
            continue

        # skip empty lines
        if line.strip() == '':
            continue

        # Check for a duplicate header line
        if header == tokens:
            continue

        data = parse_line(line)

        # A column name can occur several times in the header; only the
        # value under its first occurrence is stored for each row.
        recorded = set()
        for f, x in zip(header, data):
            if f not in recorded:
                df_dict[f].append(x)
                recorded.add(f)

if header is None:
    raise ValueError('no header line found in {}'.format(input_file_name))

df = pandas.DataFrame(df_dict)

# Everything that is not a special column is treated as a ranker column.
ranker_list = sorted(set(df.columns) - set(special_columns))
feature_list = list(special_columns) + ranker_list

# Where a ranker has no value for a team, use that team's 'Median' value.
for ranker in ranker_list:
    df[ranker] = df[ranker].fillna(df['Median'])

df[feature_list][:5]

Unnamed: 0,Team,Rank,Conf,Record,Mean,Median,St.Dev,7OT,ADE,AP,...,STH,TPR,TRK,TRP,USA,WIL,WLK,WOB,WOL,ZAM
0,Villanova,1,BE,28-3,2.17,2.0,0.88,3,2,2.0,...,1,2,2,3,2.0,1,1,1,3,2
1,Gonzaga,2,WCC,30-1,2.59,1.0,2.92,1,3,4.0,...,3,1,1,1,4.0,3,2,2,1,3
2,Kansas,3,B12,28-3,4.4,3.0,3.04,12,1,1.0,...,5,5,10,6,1.0,2,5,3,2,8
3,North Carolina,4,ACC,26-6,5.05,5.0,2.42,7,6,6.0,...,6,4,7,2,6.0,4,3,6,8,4
4,Kentucky,5,SEC,26-5,6.03,5.0,2.28,8,4,8.0,...,4,6,8,5,8.0,5,4,5,6,7


In [8]:
# Write the feature columns (special columns first, then the sorted ranker
# columns) to a pipe-delimited text file; note the delimiter is '|' even
# though the file name ends in .csv.
output_file = 'data/rankings/2017_composite_rankings.clean.csv'
df.loc[:, feature_list].to_csv(path_or_buf=output_file, sep='|')