Skip to content

Commit

Permalink
Merge pull request #87 from pvanheus/code_reformat
Browse files Browse the repository at this point in the history
Reformat all code (using CLion)
  • Loading branch information
andrewjpage committed Nov 11, 2019
2 parents f5ca054 + b8afb96 commit a274ec4
Show file tree
Hide file tree
Showing 20 changed files with 1,141 additions and 1,064 deletions.
8 changes: 4 additions & 4 deletions .travis.yml
Expand Up @@ -5,10 +5,10 @@ compiler:
addons:
apt:
packages:
- check
- check
before_install:
- sudo apt-get update -qq
script:
- autoreconf -i && ./configure --enable-maintainer-mode CFLAGS='-O0 --coverage' && make && make check
- sudo apt-get update -qq
script:
- autoreconf -i && ./configure --enable-maintainer-mode CFLAGS='-O0 --coverage' && make && make check
after_success:
- bash <(curl -s https://codecov.io/bash)
301 changes: 147 additions & 154 deletions src/alignment-file.c
Expand Up @@ -34,9 +34,9 @@ KSEQ_INIT(gzFile, gzread)
/*
 * Alignment summary shared by the functions in this file. All of it is
 * (re)computed by detect_snps_count_constant_sites() and exposed through the
 * get_*() accessors.
 */
int length_of_genome;             /* length of the first sequence read from the alignment */
int number_of_samples;            /* number of sequences read from the alignment */
int number_of_snps;               /* number of alignment columns selected for output */
char **sequence_names;            /* one heap-allocated name per sequence, in file order */
int *snp_locations;               /* 0-based column index of every selected column */
char *pseudo_reference_sequence;  /* first known base seen in each column, 'N' where none was seen */

int get_length_of_genome()
{
Expand All @@ -53,189 +53,182 @@ int get_number_of_snps()
return number_of_snps;
}

char ** get_sequence_names()
char **get_sequence_names()
{
return sequence_names;
}

int * get_snp_locations()
int *get_snp_locations()
{
return snp_locations;
}

char * get_pseudo_reference_sequence()
char *get_pseudo_reference_sequence()
{
return pseudo_reference_sequence;
}

void get_bases_for_each_snp(char filename[], char ** bases_for_snps)
void get_bases_for_each_snp(char filename[], char **bases_for_snps)
{
int l;
int i = 0;
int sequence_number = 0;
size_t length_of_genome_found =0;

gzFile fp;
kseq_t *seq;

fp = gzopen(filename, "r");
seq = kseq_init(fp);

while ((l = kseq_read(seq)) >= 0)
{
if(sequence_number == 0)
{
length_of_genome_found = seq->seq.l;
}
for(i = 0; i< number_of_snps; i++)
{
bases_for_snps[i][sequence_number] = toupper(((char *) seq->seq.s)[snp_locations[i]]);
}

if(seq->seq.l != length_of_genome_found)
{
fprintf(stderr, "Alignment %s contains sequences of unequal length. Expected length is %i but got %i in sequence %s\n\n",filename, (int) length_of_genome_found, (int) seq->seq.l,seq->name.s);
fflush(stderr);
exit(EXIT_FAILURE);
}
sequence_number++;
int l;
int i = 0;
int sequence_number = 0;
size_t length_of_genome_found = 0;

gzFile fp;
kseq_t *seq;

fp = gzopen(filename, "r");
seq = kseq_init(fp);

while ((l = kseq_read(seq)) >= 0) {
if (sequence_number == 0) {
length_of_genome_found = seq->seq.l;
}
for (i = 0; i < number_of_snps; i++) {
bases_for_snps[i][sequence_number] = toupper(((char *) seq->seq.s)[snp_locations[i]]);
}

if (seq->seq.l != length_of_genome_found) {
fprintf(stderr,
"Alignment %s contains sequences of unequal length. Expected length is %i but got %i in sequence %s\n\n",
filename, (int) length_of_genome_found, (int) seq->seq.l, seq->name.s);
fflush(stderr);
exit(EXIT_FAILURE);
}
sequence_number++;
}

kseq_destroy(seq);
gzclose(fp);
kseq_destroy(seq);
gzclose(fp);
}

/*
 * Convenience wrapper around detect_snps_count_constant_sites() that does not
 * collect constant-site counts (it passes NULL for constant_site_counts).
 */
void detect_snps(char filename[], int pure_mode, int output_monomorphic)
{
    detect_snps_count_constant_sites(filename, pure_mode, output_monomorphic, NULL);
}

void detect_snps_count_constant_sites(char filename[], int pure_mode, int output_monomorphic, int* constant_site_counts)
void
detect_snps_count_constant_sites(char filename[], int pure_mode, int output_monomorphic, int *constant_site_counts)
{
int i;
int l;
number_of_snps = 0;
number_of_samples = 0;
length_of_genome = 0;
char * first_sequence;
/* array below allows quick mapping of A, C, T and G characters to indices in base_counts array */
const int char_to_base_count_index[] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, 1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3};
int i;
int l;
number_of_snps = 0;
number_of_samples = 0;
length_of_genome = 0;
char *first_sequence;
/* array below allows quick mapping of A, C, T and G characters to indices in base_counts array */
const int char_to_base_count_index[] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, 0, -1, 1, -1, -1, -1, 2, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, 3};


gzFile fp;
kseq_t *seq;

fp = gzopen(filename, "r");
seq = kseq_init(fp);

sequence_names = calloc(DEFAULT_NUM_SAMPLES, sizeof(char*));

while ((l = kseq_read(seq)) >= 0) {
if(number_of_samples == 0)
{
length_of_genome = seq->seq.l;
first_sequence = calloc(length_of_genome + 1, sizeof(char));
pseudo_reference_sequence = calloc(length_of_genome + 1, sizeof(char));

memset(first_sequence, 'N', length_of_genome);
memset(pseudo_reference_sequence, 'N', length_of_genome);
kseq_t *seq;

fp = gzopen(filename, "r");
seq = kseq_init(fp);

sequence_names = calloc(DEFAULT_NUM_SAMPLES, sizeof(char *));

while ((l = kseq_read(seq)) >= 0) {
if (number_of_samples == 0) {
length_of_genome = seq->seq.l;
first_sequence = calloc(length_of_genome + 1, sizeof(char));
pseudo_reference_sequence = calloc(length_of_genome + 1, sizeof(char));

memset(first_sequence, 'N', length_of_genome);
memset(pseudo_reference_sequence, 'N', length_of_genome);
}
for (i = 0; i < length_of_genome; i++) {
if (first_sequence[i] == '#') {
continue;
}

if (first_sequence[i] == 'N' && !is_unknown(seq->seq.s[i])) {
first_sequence[i] = toupper(seq->seq.s[i]);
pseudo_reference_sequence[i] = toupper(seq->seq.s[i]);
}

// in pure mode we only want /ACGT/i, if any other base is found the whole column is excluded
if (pure_mode && !is_pure(seq->seq.s[i])) {
first_sequence[i] = '#';
continue;
}

if (first_sequence[i] != '>' && !is_unknown(seq->seq.s[i]) && first_sequence[i] != 'N' &&
first_sequence[i] != toupper(seq->seq.s[i])) {
first_sequence[i] = '>';
}
}

if (number_of_samples >= DEFAULT_NUM_SAMPLES) {
sequence_names = realloc(sequence_names, (number_of_samples + 1) * sizeof(char *));
}
sequence_names[number_of_samples] = calloc(MAX_SAMPLE_NAME_SIZE, sizeof(char));
strcpy(sequence_names[number_of_samples], seq->name.s);

number_of_samples++;
}

for (i = 0; i < length_of_genome; i++) {
if (first_sequence[i] == '>' || (output_monomorphic && first_sequence[i] != '#')) {
number_of_snps++;
}
}

if (number_of_snps == 0) {
fprintf(stderr, "Warning: No SNPs were detected so there is nothing to output.\n");
fflush(stderr);
exit(EXIT_FAILURE);
}
for(i = 0; i < length_of_genome; i++)
{
if(first_sequence[i] == '#')
{
continue;
}

if(first_sequence[i] == 'N' && !is_unknown(seq->seq.s[i]))
{
first_sequence[i] = toupper(seq->seq.s[i]);
pseudo_reference_sequence[i] = toupper(seq->seq.s[i]);
}

// in pure mode we only want /ACGT/i, if any other base is found the whole column is excluded
if(pure_mode && !is_pure(seq->seq.s[i]))
{
first_sequence[i] = '#';
continue;
}

if(first_sequence[i] != '>' && !is_unknown(seq->seq.s[i]) && first_sequence[i] != 'N' && first_sequence[i] != toupper(seq->seq.s[i]))
{
first_sequence[i] = '>';
}
}

if(number_of_samples >= DEFAULT_NUM_SAMPLES)
{
sequence_names = realloc(sequence_names, (number_of_samples + 1) * sizeof(char*));
}
sequence_names[number_of_samples] = calloc(MAX_SAMPLE_NAME_SIZE,sizeof(char));
strcpy(sequence_names[number_of_samples], seq->name.s);

number_of_samples++;
}

for(i = 0; i < length_of_genome; i++)
{
if(first_sequence[i] == '>' || (output_monomorphic && first_sequence[i] != '#'))
{
number_of_snps++;

int current_snp_index = 0;
snp_locations = calloc(number_of_snps + 1, sizeof(int));
for (i = 0; i < length_of_genome; i++) {
if (first_sequence[i] == '>' || (output_monomorphic && first_sequence[i] != '#')) {
snp_locations[current_snp_index] = i;
current_snp_index++;
} else if (constant_site_counts != NULL && is_pure(first_sequence[i])) {
constant_site_counts[char_to_base_count_index[(int) toupper(first_sequence[i])]]++;
}

}
}

if(number_of_snps == 0)
{
fprintf(stderr, "Warning: No SNPs were detected so there is nothing to output.\n");
fflush(stderr);
exit(EXIT_FAILURE);
}

int current_snp_index = 0;
snp_locations = calloc(number_of_snps +1, sizeof(int));
for(i = 0; i < length_of_genome; i++)
{
if(first_sequence[i] == '>' || (output_monomorphic && first_sequence[i] != '#'))
{
snp_locations[current_snp_index] = i;
current_snp_index++;
} else if (constant_site_counts != NULL && is_pure(first_sequence[i])) {
constant_site_counts[char_to_base_count_index[(int) toupper(first_sequence[i])]]++;
}

}

free(first_sequence);
kseq_destroy(seq);
gzclose(fp);
return;

free(first_sequence);
kseq_destroy(seq);
gzclose(fp);
return;
}

/*
 * Returns 1 when `base` stands for "no information" in the alignment
 * ('N', 'n', '-' or '?') and 0 for every other character.
 */
int is_unknown(char base)
{
    switch (base) {
        case 'N':
        case 'n':
        case '-':
        case '?':
            return 1;
        default:
            return 0;
    }
}

/*
 * Returns 1 when `base` is one of A, C, G or T in either case and 0 for every
 * other character.
 */
int is_pure(char base)
{
    switch (base) {
        case 'A':
        case 'C':
        case 'G':
        case 'T':
        case 'a':
        case 'c':
        case 'g':
        case 't':
            return 1;
        default:
            return 0;
    }
}
23 changes: 17 additions & 6 deletions src/alignment-file.h
Expand Up @@ -22,16 +22,27 @@

#include "kseq.h"

/* Same as detect_snps_count_constant_sites() with constant_site_counts set to NULL. */
void detect_snps(char filename[], int pure_mode, int output_monomorphic);

/*
 * Scans the alignment in `filename` and records the sample names, the pseudo
 * reference sequence and the columns selected for output. When
 * constant_site_counts is not NULL, its first four entries (A, C, G, T) are
 * incremented for the unselected columns whose recorded base is A, C, G or T.
 */
void
detect_snps_count_constant_sites(char filename[], int pure_mode, int output_monomorphic, int *constant_site_counts);

/*
 * Fills bases_for_snps[snp_index][sequence_index] with the upper-cased base of
 * every sequence at every selected column. Call after detect_snps*().
 */
void get_bases_for_each_snp(char filename[], char **bases_for_snps);

/* Returns 1 for 'N', 'n', '-' and '?', 0 otherwise. */
int is_unknown(char base);

/* NOTE(review): definition not visible here - presumably returns the sequence length recorded by detect_snps*(). */
int get_length_of_genome(void);

/* NOTE(review): definition not visible here - presumably returns the number of sequences recorded by detect_snps*(). */
int get_number_of_samples(void);

/* Returns the number of columns selected by the last detect_snps*() call. */
int get_number_of_snps(void);

/* Returns the array of sequence names (not a copy). */
char **get_sequence_names(void);

/* Returns the array of selected 0-based column indices (not a copy). */
int *get_snp_locations(void);

/* Returns the pseudo reference sequence buffer (not a copy). */
char *get_pseudo_reference_sequence(void);

/* Returns 1 for A, C, G and T in either case, 0 otherwise. */
int is_pure(char base);

#define MAX_SAMPLE_NAME_SIZE 2048
Expand Down

0 comments on commit a274ec4

Please sign in to comment.